ChangeLog fix
[emacs.git] / src / coding.c
blob8dc39e7442293d41a62f4334265b6c7f2a970c2e
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2015 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26 /*** TABLE OF CONTENTS ***
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
44 /*** 0. General comments ***
47 CODING SYSTEM
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcoding_system_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
67 o UTF-8
69 o UTF-16
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
94 o BIG5
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
102 o CCL
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
109 o Raw-text
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
115 o No-conversion
117 Like raw text, but don't do end-of-line conversion.
120 END-OF-LINE FORMAT
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
132 STRUCT CODING_SYSTEM
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we set up a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
141 /* COMMON MACROS */
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
150 Return true if the byte sequence conforms to XXX.
152 Below is the template of these functions. */
#if 0
/* Template of a detector.  Scan the source bytes of CODING and record
   in DETECT_INFO whether they were found to be, or rejected as, text
   in format XXX.  Return true if the whole source conforms to XXX.  */
static bool
detect_coding_XXX (struct coding_system *coding,
                   struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  int found = 0;
  ...;

  while (1)
    {
      /* Get one byte from the source.  If the source is exhausted, jump
         to no_more_source:.  */
      ONE_MORE_BYTE (c);

      if (! __C_conforms_to_XXX___ (c))
        break;
      /* Only a byte that positively indicates XXX is evidence for it.  */
      if (__C_strongly_suggests_XXX__ (c))
        found = CATEGORY_MASK_XXX;
    }
  /* The byte sequence is invalid for XXX.  */
  detect_info->rejected |= CATEGORY_MASK_XXX;
  return 0;

 no_more_source:
  /* The source exhausted successfully.  */
  detect_info->found |= found;
  return 1;
}
#endif
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
200 Below is the template of these functions. */
#if 0
/* Template of a decoder.  Decode the not-yet-consumed source bytes of
   CODING into CODING->charbuf, then update CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used.  */
static void
decode_coding_XXXX (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code, or
     when there's no room in CHARBUF for a decoded character.  */
  const unsigned char *src_base;
  /* A buffer to produce decoded characters.  */
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  bool multibytep = coding->src_multibyte;

  while (1)
    {
      src_base = src;
      if (charbuf >= charbuf_end)
        /* No more room to produce a decoded character.  */
        break;
      ONE_MORE_BYTE (c);
      /* Decode it. */
    }

 no_more_source:
  if (src_base < src_end
      && coding->mode & CODING_MODE_LAST_BLOCK)
    /* If the source ends by partial bytes to construct a character,
       treat them as eight-bit raw data.  */
    while (src_base < src_end && charbuf < charbuf_end)
      *charbuf++ = *src_base++;
  /* Remember how many bytes and characters we consumed.  If the
     source is multibyte, the bytes and chars are not identical.  */
  coding->consumed = coding->consumed_char = src_base - coding->source;
  /* Remember how many characters we produced.  */
  coding->charbuf_used = charbuf - coding->charbuf;
}
#endif
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce an encoded text until it
256 reaches the head of not-yet-encoded source text.
258 Below is a template of these functions. */
#if 0
/* Template of an encoder.  Encode the characters held in
   CODING->charbuf into CODING->destination, then update
   CODING->produced_char and CODING->produced.  */
static void
encode_coding_XXX (struct coding_system *coding)
{
  bool multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  /* The end is computed from CODING's buffer; CHARBUF itself is a
     plain `int *' and has no members.  */
  int *charbuf_end = coding->charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
  ptrdiff_t produced_chars = 0;

  for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
    {
      int c = *charbuf;
      /* Encode C into DST, and increment DST.  */
    }
 label_no_more_destination:
  /* How many chars and bytes we produced.  */
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
}
#endif
284 /*** 1. Preamble ***/
286 #include <config.h>
287 #include <stdio.h>
289 #ifdef HAVE_WCHAR_H
290 #include <wchar.h>
291 #endif /* HAVE_WCHAR_H */
293 #include "lisp.h"
294 #include "character.h"
295 #include "buffer.h"
296 #include "charset.h"
297 #include "ccl.h"
298 #include "composite.h"
299 #include "coding.h"
300 #include "window.h"
301 #include "frame.h"
302 #include "termhooks.h"
304 Lisp_Object Vcoding_system_hash_table;
306 static Lisp_Object Qcoding_system, Qeol_type;
307 static Lisp_Object Qcoding_aliases;
308 Lisp_Object Qunix, Qdos;
309 static Lisp_Object Qmac;
310 Lisp_Object Qbuffer_file_coding_system;
311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
312 static Lisp_Object Qdefault_char;
313 Lisp_Object Qno_conversion, Qundecided;
314 Lisp_Object Qcharset, Qutf_8;
315 static Lisp_Object Qiso_2022;
316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
317 static Lisp_Object Qbig, Qlittle;
318 static Lisp_Object Qcoding_system_history;
319 static Lisp_Object Qvalid_codes;
320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
323 static Lisp_Object QCascii_compatible_p;
325 Lisp_Object Qcall_process, Qcall_process_region;
326 Lisp_Object Qstart_process, Qopen_network_stream;
327 static Lisp_Object Qtarget_idx;
329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
331 /* If a symbol has this property, evaluate the value to define the
332 symbol as a coding system. */
333 static Lisp_Object Qcoding_system_define_form;
335 /* Format of end-of-line decided by system. This is Qunix on
336 Unix and Mac, Qdos on DOS/Windows.
337 This has an effect only for external encoding (i.e. for output to
338 file and process), not for in-buffer or Lisp string encoding. */
339 static Lisp_Object system_eol_type;
341 #ifdef emacs
343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
345 /* Coding system emacs-mule and raw-text are for converting only
346 end-of-line format. */
347 Lisp_Object Qemacs_mule, Qraw_text;
348 Lisp_Object Qutf_8_emacs;
350 #if defined (WINDOWSNT) || defined (CYGWIN)
351 static Lisp_Object Qutf_16le;
352 #endif
354 /* Coding-systems are handed between Emacs Lisp programs and C internal
355 routines by the following three variables. */
356 /* Coding system to be used to encode text for terminal display when
357 terminal coding system is nil. */
358 struct coding_system safe_terminal_coding;
360 #endif /* emacs */
362 Lisp_Object Qtranslation_table;
363 Lisp_Object Qtranslation_table_id;
364 static Lisp_Object Qtranslation_table_for_decode;
365 static Lisp_Object Qtranslation_table_for_encode;
367 /* Two special coding systems. */
368 static Lisp_Object Vsjis_coding_system;
369 static Lisp_Object Vbig5_coding_system;
371 /* ISO2022 section */
373 #define CODING_ISO_INITIAL(coding, reg) \
374 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
375 coding_attr_iso_initial), \
376 reg)))
379 #define CODING_ISO_REQUEST(coding, charset_id) \
380 (((charset_id) <= (coding)->max_charset_id \
381 ? ((coding)->safe_charsets[charset_id] != 255 \
382 ? (coding)->safe_charsets[charset_id] \
383 : -1) \
384 : -1))
387 #define CODING_ISO_FLAGS(coding) \
388 ((coding)->spec.iso_2022.flags)
389 #define CODING_ISO_DESIGNATION(coding, reg) \
390 ((coding)->spec.iso_2022.current_designation[reg])
391 #define CODING_ISO_INVOCATION(coding, plane) \
392 ((coding)->spec.iso_2022.current_invocation[plane])
393 #define CODING_ISO_SINGLE_SHIFTING(coding) \
394 ((coding)->spec.iso_2022.single_shifting)
395 #define CODING_ISO_BOL(coding) \
396 ((coding)->spec.iso_2022.bol)
397 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
398 (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1 \
399 : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
400 #define CODING_ISO_CMP_STATUS(coding) \
401 (&(coding)->spec.iso_2022.cmp_status)
402 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
403 ((coding)->spec.iso_2022.ctext_extended_segment_len)
404 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
405 ((coding)->spec.iso_2022.embedded_utf_8)
407 /* Control characters of ISO2022. */
408 /* code */ /* function */
409 #define ISO_CODE_SO 0x0E /* shift-out */
410 #define ISO_CODE_SI 0x0F /* shift-in */
411 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
412 #define ISO_CODE_ESC 0x1B /* escape */
413 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
414 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
415 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
/* Every 1-byte code of ISO2022 is classified into one of the
   following classes.  */
enum iso_code_class_type
{
  ISO_control_0,                    /* Control codes in the range
                                       0x00..0x1F and 0x7F, except for the
                                       following 4 codes.  */
  ISO_shift_out,                    /* ISO_CODE_SO (0x0E) */
  ISO_shift_in,                     /* ISO_CODE_SI (0x0F) */
  ISO_single_shift_2_7,             /* ISO_CODE_SS2_7 (0x19) */
  ISO_escape,                       /* ISO_CODE_ESC (0x1B) */
  ISO_control_1,                    /* Control codes in the range
                                       0x80..0x9F, except for the
                                       following 3 codes.  */
  ISO_single_shift_2,               /* ISO_CODE_SS2 (0x8E) */
  ISO_single_shift_3,               /* ISO_CODE_SS3 (0x8F) */
  ISO_control_sequence_introducer,  /* ISO_CODE_CSI (0x9B) */
  ISO_0x20_or_0x7F,                 /* Codes of the values 0x20 or 0x7F.  */
  ISO_graphic_plane_0,              /* Graphic codes in the range 0x21..0x7E.  */
  ISO_0xA0_or_0xFF,                 /* Codes of the values 0xA0 or 0xFF.  */
  ISO_graphic_plane_1               /* Graphic codes in the range 0xA1..0xFE.  */
};
440 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
441 `iso-flags' attribute of an iso2022 coding system. */
443 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
444 instead of the correct short-form sequence (e.g. ESC $ A). */
445 #define CODING_ISO_FLAG_LONG_FORM 0x0001
447 /* If set, reset graphic planes and registers at end-of-line to the
448 initial state. */
449 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
451 /* If set, reset graphic planes and registers before any control
452 characters to the initial state. */
453 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
455 /* If set, encode by 7-bit environment. */
456 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
458 /* If set, use locking-shift function. */
459 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
461 /* If set, use single-shift function. Overwrite
462 CODING_ISO_FLAG_LOCKING_SHIFT. */
463 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
465 /* If set, use designation escape sequence. */
466 #define CODING_ISO_FLAG_DESIGNATION 0x0040
468 /* If set, produce revision number sequence. */
469 #define CODING_ISO_FLAG_REVISION 0x0080
471 /* If set, produce ISO6429's direction specifying sequence. */
472 #define CODING_ISO_FLAG_DIRECTION 0x0100
474 /* If set, assume designation states are reset at beginning of line on
475 output. */
476 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
478 /* If set, designation sequence should be placed at beginning of line
479 on output. */
480 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
482 /* If set, do not encode unsafe characters on output. */
483 #define CODING_ISO_FLAG_SAFE 0x0800
485 /* If set, extra latin codes (128..159) are accepted as a valid code
486 on input. */
487 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
489 #define CODING_ISO_FLAG_COMPOSITION 0x2000
491 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
493 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
495 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
497 #define CODING_ISO_FLAG_LEVEL_4 0x20000
499 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
501 /* A character to be produced on output if encoding of the original
502 character is prohibited by CODING_ISO_FLAG_SAFE. */
503 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
505 /* UTF-8 section */
506 #define CODING_UTF_8_BOM(coding) \
507 ((coding)->spec.utf_8_bom)
509 /* UTF-16 section */
510 #define CODING_UTF_16_BOM(coding) \
511 ((coding)->spec.utf_16.bom)
513 #define CODING_UTF_16_ENDIAN(coding) \
514 ((coding)->spec.utf_16.endian)
516 #define CODING_UTF_16_SURROGATE(coding) \
517 ((coding)->spec.utf_16.surrogate)
520 /* CCL section */
521 #define CODING_CCL_DECODER(coding) \
522 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
523 #define CODING_CCL_ENCODER(coding) \
524 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
525 #define CODING_CCL_VALIDS(coding) \
526 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
/* Index for each coding category in `coding_categories'.  The order
   matters: each CATEGORY_MASK_XXX bit is 1 shifted by the matching
   enumerator, and coding_category_max sizes the per-category arrays.  */

enum coding_category
{
  coding_category_iso_7,
  coding_category_iso_7_tight,
  coding_category_iso_8_1,
  coding_category_iso_8_2,
  coding_category_iso_7_else,
  coding_category_iso_8_else,
  coding_category_utf_8_auto,
  coding_category_utf_8_nosig,
  coding_category_utf_8_sig,
  coding_category_utf_16_auto,
  coding_category_utf_16_be,
  coding_category_utf_16_le,
  coding_category_utf_16_be_nosig,
  coding_category_utf_16_le_nosig,
  coding_category_charset,
  coding_category_sjis,
  coding_category_big5,
  coding_category_ccl,
  coding_category_emacs_mule,
  /* All above are targets of code detection.  */
  coding_category_raw_text,
  coding_category_undecided,
  coding_category_max
};
557 /* Definitions of flag bits used in detect_coding_XXXX. */
558 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
559 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
560 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
561 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
562 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
563 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
564 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
565 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
566 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
567 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
568 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
569 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
570 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
571 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
572 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
573 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
574 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
575 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
576 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
577 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
579 /* This value is returned if detect_coding_mask () finds nothing other
580 than ASCII characters. */
581 #define CATEGORY_MASK_ANY \
582 (CATEGORY_MASK_ISO_7 \
583 | CATEGORY_MASK_ISO_7_TIGHT \
584 | CATEGORY_MASK_ISO_8_1 \
585 | CATEGORY_MASK_ISO_8_2 \
586 | CATEGORY_MASK_ISO_7_ELSE \
587 | CATEGORY_MASK_ISO_8_ELSE \
588 | CATEGORY_MASK_UTF_8_AUTO \
589 | CATEGORY_MASK_UTF_8_NOSIG \
590 | CATEGORY_MASK_UTF_8_SIG \
591 | CATEGORY_MASK_UTF_16_AUTO \
592 | CATEGORY_MASK_UTF_16_BE \
593 | CATEGORY_MASK_UTF_16_LE \
594 | CATEGORY_MASK_UTF_16_BE_NOSIG \
595 | CATEGORY_MASK_UTF_16_LE_NOSIG \
596 | CATEGORY_MASK_CHARSET \
597 | CATEGORY_MASK_SJIS \
598 | CATEGORY_MASK_BIG5 \
599 | CATEGORY_MASK_CCL \
600 | CATEGORY_MASK_EMACS_MULE)
603 #define CATEGORY_MASK_ISO_7BIT \
604 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
606 #define CATEGORY_MASK_ISO_8BIT \
607 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
609 #define CATEGORY_MASK_ISO_ELSE \
610 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
612 #define CATEGORY_MASK_ISO_ESCAPE \
613 (CATEGORY_MASK_ISO_7 \
614 | CATEGORY_MASK_ISO_7_TIGHT \
615 | CATEGORY_MASK_ISO_7_ELSE \
616 | CATEGORY_MASK_ISO_8_ELSE)
618 #define CATEGORY_MASK_ISO \
619 ( CATEGORY_MASK_ISO_7BIT \
620 | CATEGORY_MASK_ISO_8BIT \
621 | CATEGORY_MASK_ISO_ELSE)
623 #define CATEGORY_MASK_UTF_16 \
624 (CATEGORY_MASK_UTF_16_AUTO \
625 | CATEGORY_MASK_UTF_16_BE \
626 | CATEGORY_MASK_UTF_16_LE \
627 | CATEGORY_MASK_UTF_16_BE_NOSIG \
628 | CATEGORY_MASK_UTF_16_LE_NOSIG)
630 #define CATEGORY_MASK_UTF_8 \
631 (CATEGORY_MASK_UTF_8_AUTO \
632 | CATEGORY_MASK_UTF_8_NOSIG \
633 | CATEGORY_MASK_UTF_8_SIG)
635 /* Table of coding categories (Lisp symbols). This variable is for
636 internal use only. */
637 static Lisp_Object Vcoding_category_table;
639 /* Table of coding-categories ordered by priority. */
640 static enum coding_category coding_priorities[coding_category_max];
642 /* Nth element is a coding context for the coding system bound to the
643 Nth coding category. */
644 static struct coding_system coding_categories[coding_category_max];
646 /*** Commonly used macros and functions ***/
648 #ifndef min
649 #define min(a, b) ((a) < (b) ? (a) : (b))
650 #endif
651 #ifndef max
652 #define max(a, b) ((a) > (b) ? (a) : (b))
653 #endif
655 /* Encode a flag that can be nil, something else, or t as -1, 0, 1. */
657 static int
658 encode_inhibit_flag (Lisp_Object flag)
660 return NILP (flag) ? -1 : EQ (flag, Qt);
/* Return true if the value of ENCODED_FLAG says a flag should be
   treated as set.  ENCODED_FLAG 1 means yes, -1 means no, and 0 means
   defer to the user variable VAR.  */

static bool
inhibit_flag (int encoded_flag, bool var)
{
  /* VAR contributes 0 or 1, so the sum is positive exactly when
     ENCODED_FLAG is 1, or is 0 with VAR set.  */
  int total = encoded_flag + (var ? 1 : 0);
  return total > 0;
}
672 #define CODING_GET_INFO(coding, attrs, charset_list) \
673 do { \
674 (attrs) = CODING_ID_ATTRS ((coding)->id); \
675 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
676 } while (0)
678 static void
679 CHECK_NATNUM_CAR (Lisp_Object x)
681 Lisp_Object tmp = XCAR (x);
682 CHECK_NATNUM (tmp);
683 XSETCAR (x, tmp);
686 static void
687 CHECK_NATNUM_CDR (Lisp_Object x)
689 Lisp_Object tmp = XCDR (x);
690 CHECK_NATNUM (tmp);
691 XSETCDR (x, tmp);
/* Safely get one byte from the source text pointed by SRC which ends
   at SRC_END, and set C to that byte.  If there are not enough bytes
   in the source, it jumps to 'no_more_source' (after recording
   CODING_RESULT_INSUFFICIENT_SRC when SRC has advanced past SRC_BASE).
   If MULTIBYTEP, and a multibyte character is found at SRC, set C to
   the negative value of the character code and record
   CODING_RESULT_INVALID_SRC.  CONSUMED_CHARS is incremented for each
   byte or character fetched.  The caller should declare and set these
   variables appropriately in advance:
	src, src_base, src_end, multibytep, consumed_chars, coding
   and must provide the label 'no_more_source'.  */

#define ONE_MORE_BYTE(c)                                \
  do {                                                  \
    if (src == src_end)                                 \
      {                                                 \
        if (src_base < src)                             \
          record_conversion_result                      \
            (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
        goto no_more_source;                            \
      }                                                 \
    c = *src++;                                         \
    if (multibytep && (c & 0x80))                       \
      {                                                 \
        if ((c & 0xFE) == 0xC0)                         \
          c = ((c & 1) << 6) | *src++;                  \
        else                                            \
          {                                             \
            src--;                                      \
            c = - string_char (src, &src, NULL);        \
            record_conversion_result                    \
              (coding, CODING_RESULT_INVALID_SRC);      \
          }                                             \
      }                                                 \
    consumed_chars++;                                   \
  } while (0)
/* Safely get two bytes from the source text pointed by SRC which ends
   at SRC_END, and set C1 and C2 to those bytes while skipping the
   heading multibyte characters.  If there are not enough bytes in the
   source, it jumps to 'no_more_source'.  If MULTIBYTEP and a
   multibyte character is found for C2, set C2 to -1.  The caller
   should declare and set these variables appropriately in advance:
	src, src_end, multibytep
   It is intended that this macro is used in detect_coding_utf_16.  */

#define TWO_MORE_BYTES(c1, c2)                          \
  do {                                                  \
    do {                                                \
      if (src == src_end)                               \
        goto no_more_source;                            \
      c1 = *src++;                                      \
      if (multibytep && (c1 & 0x80))                    \
        {                                               \
          if ((c1 & 0xFE) == 0xC0)                      \
            c1 = ((c1 & 1) << 6) | *src++;              \
          else                                          \
            {                                           \
              src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
              c1 = -1;                                  \
            }                                           \
        }                                               \
    } while (c1 < 0);                                   \
    if (src == src_end)                                 \
      goto no_more_source;                              \
    c2 = *src++;                                        \
    if (multibytep && (c2 & 0x80))                      \
      {                                                 \
        if ((c2 & 0xFE) == 0xC0)                        \
          c2 = ((c2 & 1) << 6) | *src++;                \
        else                                            \
          c2 = -1;                                      \
      }                                                 \
  } while (0)
/* Store a byte C in the place pointed by DST and increment DST to the
   next free point, and increment PRODUCED_CHARS.  The caller should
   assure that C is 0..127, and declare and set the variables `dst'
   and `produced_chars' appropriately in advance.  */

#define EMIT_ONE_ASCII_BYTE(c)  \
  do {                          \
    produced_chars++;           \
    *dst++ = (c);               \
  } while (0)
782 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
784 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
785 do { \
786 produced_chars += 2; \
787 *dst++ = (c1), *dst++ = (c2); \
788 } while (0)
/* Store a byte C in the place pointed by DST and increment DST to the
   next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
   store in an appropriate multibyte form (a byte of 0x80 or above is
   first converted with BYTE8_TO_CHAR).  The caller should declare and
   set the variables `dst', `multibytep' and `produced_chars'
   appropriately in advance.  */

#define EMIT_ONE_BYTE(c)                        \
  do {                                          \
    produced_chars++;                           \
    if (multibytep)                             \
      {                                         \
        unsigned ch = (c);                      \
        if (ch >= 0x80)                         \
          ch = BYTE8_TO_CHAR (ch);              \
        CHAR_STRING_ADVANCE (ch, dst);          \
      }                                         \
    else                                        \
      *dst++ = (c);                             \
  } while (0)
/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  PRODUCED_CHARS
   is incremented by 2.  */

#define EMIT_TWO_BYTES(c1, c2)                  \
  do {                                          \
    produced_chars += 2;                        \
    if (multibytep)                             \
      {                                         \
        unsigned ch;                            \
                                                \
        ch = (c1);                              \
        if (ch >= 0x80)                         \
          ch = BYTE8_TO_CHAR (ch);              \
        CHAR_STRING_ADVANCE (ch, dst);          \
        ch = (c2);                              \
        if (ch >= 0x80)                         \
          ch = BYTE8_TO_CHAR (ch);              \
        CHAR_STRING_ADVANCE (ch, dst);          \
      }                                         \
    else                                        \
      {                                         \
        *dst++ = (c1);                          \
        *dst++ = (c2);                          \
      }                                         \
  } while (0)
838 #define EMIT_THREE_BYTES(c1, c2, c3) \
839 do { \
840 EMIT_ONE_BYTE (c1); \
841 EMIT_TWO_BYTES (c2, c3); \
842 } while (0)
845 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
846 do { \
847 EMIT_TWO_BYTES (c1, c2); \
848 EMIT_TWO_BYTES (c3, c4); \
849 } while (0)
852 static void
853 record_conversion_result (struct coding_system *coding,
854 enum coding_result_code result)
856 coding->result = result;
857 switch (result)
859 case CODING_RESULT_INSUFFICIENT_SRC:
860 Vlast_code_conversion_error = Qinsufficient_source;
861 break;
862 case CODING_RESULT_INVALID_SRC:
863 Vlast_code_conversion_error = Qinvalid_source;
864 break;
865 case CODING_RESULT_INTERRUPT:
866 Vlast_code_conversion_error = Qinterrupted;
867 break;
868 case CODING_RESULT_INSUFFICIENT_DST:
869 /* Don't record this error in Vlast_code_conversion_error
870 because it happens just temporarily and is resolved when the
871 whole conversion is finished. */
872 break;
873 case CODING_RESULT_SUCCESS:
874 break;
875 default:
876 Vlast_code_conversion_error = intern ("Unknown error");
/* These wrapper macros are used to preserve validity of pointers into
   buffer text across calls to decode_char, encode_char, etc, which
   could cause relocation of buffers if it loads a charset map,
   because loading a charset map allocates large structures.  */

/* Set C to DECODE_CHAR (CHARSET, CODE).  If a charset map was loaded
   meanwhile and CODING's source moved, shift SRC, SRC_BASE and
   SRC_END by the same offset.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    ptrdiff_t offset;                                                        \
                                                                             \
    charset_map_loaded = 0;                                                  \
    c = DECODE_CHAR (charset, code);                                         \
    if (charset_map_loaded                                                   \
        && (offset = coding_change_source (coding)))                         \
      {                                                                      \
        src += offset;                                                       \
        src_base += offset;                                                  \
        src_end += offset;                                                   \
      }                                                                      \
  } while (0)
/* Set CODE to ENCODE_CHAR (CHARSET, C).  If a charset map was loaded
   meanwhile and CODING's destination moved, shift DST and DST_END by
   the same offset.  */
#define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
  do {                                                                  \
    ptrdiff_t offset;                                                   \
                                                                        \
    charset_map_loaded = 0;                                             \
    code = ENCODE_CHAR (charset, c);                                    \
    if (charset_map_loaded                                              \
        && (offset = coding_change_destination (coding)))               \
      {                                                                 \
        dst += offset;                                                  \
        dst_end += offset;                                              \
      }                                                                 \
  } while (0)
/* Set CHARSET to char_charset (C, CHARSET_LIST, CODE_RETURN).  If a
   charset map was loaded meanwhile and CODING's destination moved,
   shift DST and DST_END by the same offset.  */
#define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
  do {                                                                  \
    ptrdiff_t offset;                                                   \
                                                                        \
    charset_map_loaded = 0;                                             \
    charset = char_charset (c, charset_list, code_return);              \
    if (charset_map_loaded                                              \
        && (offset = coding_change_destination (coding)))               \
      {                                                                 \
        dst += offset;                                                  \
        dst_end += offset;                                              \
      }                                                                 \
  } while (0)
/* Set RESULT to CHAR_CHARSET_P (C, CHARSET).  If a charset map was
   loaded meanwhile and CODING's destination moved, shift DST and
   DST_END by the same offset.  */
#define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
  do {                                                                  \
    ptrdiff_t offset;                                                   \
                                                                        \
    charset_map_loaded = 0;                                             \
    result = CHAR_CHARSET_P (c, charset);                               \
    if (charset_map_loaded                                              \
        && (offset = coding_change_destination (coding)))               \
      {                                                                 \
        dst += offset;                                                  \
        dst_end += offset;                                              \
      }                                                                 \
  } while (0)
/* If there are not at least BYTES bytes of room at dst, allocate
   memory for coding->destination and update dst and dst_end.  The
   amount requested is BYTES plus the number of elements still left in
   charbuf.  We don't have to take care of coding->source which will
   be relocated.  It is handled by calling coding_set_source in
   encode_coding.  The caller should declare and set these variables
   appropriately in advance:
	coding, dst, dst_end, charbuf, charbuf_end  */

#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst + (bytes) >= dst_end)                               \
      {                                                         \
        ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
                                                                \
        dst = alloc_destination (coding, more_bytes, dst);      \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
960 /* Store multibyte form of the character C in P, and advance P to the
961 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
962 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
963 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
965 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
967 /* Return the character code of character whose multibyte form is at
968 P, and advance P to the end of the multibyte form. This used to be
969 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
970 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
972 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
974 /* Set coding->source from coding->src_object. */
976 static void
977 coding_set_source (struct coding_system *coding)
979 if (BUFFERP (coding->src_object))
981 struct buffer *buf = XBUFFER (coding->src_object);
983 if (coding->src_pos < 0)
984 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
985 else
986 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
988 else if (STRINGP (coding->src_object))
990 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
992 else
994 /* Otherwise, the source is C string and is never relocated
995 automatically. Thus we don't have to update anything. */
1000 /* Set coding->source from coding->src_object, and return how many
1001 bytes coding->source was changed. */
1003 static ptrdiff_t
1004 coding_change_source (struct coding_system *coding)
1006 const unsigned char *orig = coding->source;
1007 coding_set_source (coding);
1008 return coding->source - orig;
1012 /* Set coding->destination from coding->dst_object. */
1014 static void
1015 coding_set_destination (struct coding_system *coding)
1017 if (BUFFERP (coding->dst_object))
1019 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1021 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1022 coding->dst_bytes = (GAP_END_ADDR
1023 - (coding->src_bytes - coding->consumed)
1024 - coding->destination);
1026 else
1028 /* We are sure that coding->dst_pos_byte is before the gap
1029 of the buffer. */
1030 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1031 + coding->dst_pos_byte - BEG_BYTE);
1032 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1033 - coding->destination);
1036 else
1038 /* Otherwise, the destination is C string and is never relocated
1039 automatically. Thus we don't have to update anything. */
1044 /* Set coding->destination from coding->dst_object, and return how
1045 many bytes coding->destination was changed. */
1047 static ptrdiff_t
1048 coding_change_destination (struct coding_system *coding)
1050 const unsigned char *orig = coding->destination;
1051 coding_set_destination (coding);
1052 return coding->destination - orig;
1056 static void
1057 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1059 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1060 string_overflow ();
1061 coding->destination = xrealloc (coding->destination,
1062 coding->dst_bytes + bytes);
1063 coding->dst_bytes += bytes;
1066 static void
1067 coding_alloc_by_making_gap (struct coding_system *coding,
1068 ptrdiff_t gap_head_used, ptrdiff_t bytes)
1070 if (EQ (coding->src_object, coding->dst_object))
1072 /* The gap may contain the produced data at the head and not-yet
1073 consumed data at the tail. To preserve those data, we at
1074 first make the gap size to zero, then increase the gap
1075 size. */
1076 ptrdiff_t add = GAP_SIZE;
1078 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1079 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1080 make_gap (bytes);
1081 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1082 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1084 else
1085 make_gap_1 (XBUFFER (coding->dst_object), bytes);
1089 static unsigned char *
1090 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1091 unsigned char *dst)
1093 ptrdiff_t offset = dst - coding->destination;
1095 if (BUFFERP (coding->dst_object))
1097 struct buffer *buf = XBUFFER (coding->dst_object);
1099 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1101 else
1102 coding_alloc_by_realloc (coding, nbytes);
1103 coding_set_destination (coding);
1104 dst = coding->destination + offset;
1105 return dst;
/** Macros for annotations.  */

/* Annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]

   NBYTES is the number of bytes specified in the header part of
   old-style emacs-mule encoding, or 0 for the other kind of
   composition.

   METHOD is one of enum composition_method.

   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.

   If ANNOTATION_MASK is 0, this annotation is just a placeholder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5
/* Append the three common header elements of an annotation
   (-LEN, MASK, NCHARS) at BUF, advancing BUF past them, and flag the
   caller's `coding' as annotated.

   The expansion deliberately ends without a semicolon so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the unbraced body of an if/else.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
  do {                                              \
    *(buf)++ = -(len);                              \
    *(buf)++ = (mask);                              \
    *(buf)++ = (nchars);                            \
    coding->annotated = 1;                          \
  } while (0)
/* Append a composition annotation header at BUF and advance BUF past
   it.  The header has five elements:
     [ -5 CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
   Every parameter is parenthesized in the expansion, matching
   ADD_ANNOTATION_DATA.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
/* Append a charset annotation at BUF and advance BUF past it.  The
   annotation has four elements:
     [ -4 CODING_ANNOTATE_CHARSET_MASK NCHARS ID ]
   Every parameter is parenthesized in the expansion, matching
   ADD_ANNOTATION_DATA.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
/* Bitmasks for coding->eol_seen.  Each end-of-line style has its own
   bit so that several styles can be recorded at once.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     1
#define EOL_SEEN_CR     2
#define EOL_SEEN_CRLF   4
/*** 2. Emacs' internal format (emacs-utf-8) ***/

/*** 3. UTF-8 ***/
/* Byte-class predicates for UTF-8 and the three bytes of the UTF-8
   signature (BOM).  C is expected to be a byte value; each predicate
   tests the fixed high bits that identify the octet's role.

   See the above "GENERAL NOTES on `detect_coding_XXX ()' functions"
   for the detector that uses them.  */

#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1194 /* Unlike the other detect_coding_XXX, this function counts the number
1195 of characters and checks the EOL format. */
1197 static bool
1198 detect_coding_utf_8 (struct coding_system *coding,
1199 struct coding_detection_info *detect_info)
1201 const unsigned char *src = coding->source, *src_base;
1202 const unsigned char *src_end = coding->source + coding->src_bytes;
1203 bool multibytep = coding->src_multibyte;
1204 ptrdiff_t consumed_chars = 0;
1205 bool bom_found = 0;
1206 ptrdiff_t nchars = coding->head_ascii;
1207 int eol_seen = coding->eol_seen;
1209 detect_info->checked |= CATEGORY_MASK_UTF_8;
1210 /* A coding system of this category is always ASCII compatible. */
1211 src += nchars;
1213 if (src == coding->source /* BOM should be at the head. */
1214 && src + 3 < src_end /* BOM is 3-byte long. */
1215 && src[0] == UTF_8_BOM_1
1216 && src[1] == UTF_8_BOM_2
1217 && src[2] == UTF_8_BOM_3)
1219 bom_found = 1;
1220 src += 3;
1221 nchars++;
1224 while (1)
1226 int c, c1, c2, c3, c4;
1228 src_base = src;
1229 ONE_MORE_BYTE (c);
1230 if (c < 0 || UTF_8_1_OCTET_P (c))
1232 nchars++;
1233 if (c == '\r')
1235 if (src < src_end && *src == '\n')
1237 eol_seen |= EOL_SEEN_CRLF;
1238 src++;
1239 nchars++;
1241 else
1242 eol_seen |= EOL_SEEN_CR;
1244 else if (c == '\n')
1245 eol_seen |= EOL_SEEN_LF;
1246 continue;
1248 ONE_MORE_BYTE (c1);
1249 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1250 break;
1251 if (UTF_8_2_OCTET_LEADING_P (c))
1253 nchars++;
1254 continue;
1256 ONE_MORE_BYTE (c2);
1257 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1258 break;
1259 if (UTF_8_3_OCTET_LEADING_P (c))
1261 nchars++;
1262 continue;
1264 ONE_MORE_BYTE (c3);
1265 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1266 break;
1267 if (UTF_8_4_OCTET_LEADING_P (c))
1269 nchars++;
1270 continue;
1272 ONE_MORE_BYTE (c4);
1273 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1274 break;
1275 if (UTF_8_5_OCTET_LEADING_P (c))
1277 nchars++;
1278 continue;
1280 break;
1282 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1283 return 0;
1285 no_more_source:
1286 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1288 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1289 return 0;
1291 if (bom_found)
1293 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1294 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1296 else
1298 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1299 if (nchars < src_end - coding->source)
1300 /* The found characters are less than source bytes, which
1301 means that we found a valid non-ASCII characters. */
1302 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1304 coding->detected_utf8_bytes = src_base - coding->source;
1305 coding->detected_utf8_chars = nchars;
1306 return 1;
1310 static void
1311 decode_coding_utf_8 (struct coding_system *coding)
1313 const unsigned char *src = coding->source + coding->consumed;
1314 const unsigned char *src_end = coding->source + coding->src_bytes;
1315 const unsigned char *src_base;
1316 int *charbuf = coding->charbuf + coding->charbuf_used;
1317 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1318 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1319 bool multibytep = coding->src_multibyte;
1320 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1321 bool eol_dos
1322 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1323 int byte_after_cr = -1;
1325 if (bom != utf_without_bom)
1327 int c1, c2, c3;
1329 src_base = src;
1330 ONE_MORE_BYTE (c1);
1331 if (! UTF_8_3_OCTET_LEADING_P (c1))
1332 src = src_base;
1333 else
1335 ONE_MORE_BYTE (c2);
1336 if (! UTF_8_EXTRA_OCTET_P (c2))
1337 src = src_base;
1338 else
1340 ONE_MORE_BYTE (c3);
1341 if (! UTF_8_EXTRA_OCTET_P (c3))
1342 src = src_base;
1343 else
1345 if ((c1 != UTF_8_BOM_1)
1346 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1347 src = src_base;
1348 else
1349 CODING_UTF_8_BOM (coding) = utf_without_bom;
1354 CODING_UTF_8_BOM (coding) = utf_without_bom;
1356 while (1)
1358 int c, c1, c2, c3, c4, c5;
1360 src_base = src;
1361 consumed_chars_base = consumed_chars;
1363 if (charbuf >= charbuf_end)
1365 if (byte_after_cr >= 0)
1366 src_base--;
1367 break;
1370 /* In the simple case, rapidly handle ordinary characters */
1371 if (multibytep && ! eol_dos
1372 && charbuf < charbuf_end - 6 && src < src_end - 6)
1374 while (charbuf < charbuf_end - 6 && src < src_end - 6)
1376 c1 = *src;
1377 if (c1 & 0x80)
1378 break;
1379 src++;
1380 consumed_chars++;
1381 *charbuf++ = c1;
1383 c1 = *src;
1384 if (c1 & 0x80)
1385 break;
1386 src++;
1387 consumed_chars++;
1388 *charbuf++ = c1;
1390 c1 = *src;
1391 if (c1 & 0x80)
1392 break;
1393 src++;
1394 consumed_chars++;
1395 *charbuf++ = c1;
1397 c1 = *src;
1398 if (c1 & 0x80)
1399 break;
1400 src++;
1401 consumed_chars++;
1402 *charbuf++ = c1;
1404 /* If we handled at least one character, restart the main loop. */
1405 if (src != src_base)
1406 continue;
1409 if (byte_after_cr >= 0)
1410 c1 = byte_after_cr, byte_after_cr = -1;
1411 else
1412 ONE_MORE_BYTE (c1);
1413 if (c1 < 0)
1415 c = - c1;
1417 else if (UTF_8_1_OCTET_P (c1))
1419 if (eol_dos && c1 == '\r')
1420 ONE_MORE_BYTE (byte_after_cr);
1421 c = c1;
1423 else
1425 ONE_MORE_BYTE (c2);
1426 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1427 goto invalid_code;
1428 if (UTF_8_2_OCTET_LEADING_P (c1))
1430 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1431 /* Reject overlong sequences here and below. Encoders
1432 producing them are incorrect, they can be misleading,
1433 and they mess up read/write invariance. */
1434 if (c < 128)
1435 goto invalid_code;
1437 else
1439 ONE_MORE_BYTE (c3);
1440 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1441 goto invalid_code;
1442 if (UTF_8_3_OCTET_LEADING_P (c1))
1444 c = (((c1 & 0xF) << 12)
1445 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1446 if (c < 0x800
1447 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1448 goto invalid_code;
1450 else
1452 ONE_MORE_BYTE (c4);
1453 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1454 goto invalid_code;
1455 if (UTF_8_4_OCTET_LEADING_P (c1))
1457 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1458 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1459 if (c < 0x10000)
1460 goto invalid_code;
1462 else
1464 ONE_MORE_BYTE (c5);
1465 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1466 goto invalid_code;
1467 if (UTF_8_5_OCTET_LEADING_P (c1))
1469 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1470 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1471 | (c5 & 0x3F));
1472 if ((c > MAX_CHAR) || (c < 0x200000))
1473 goto invalid_code;
1475 else
1476 goto invalid_code;
1482 *charbuf++ = c;
1483 continue;
1485 invalid_code:
1486 src = src_base;
1487 consumed_chars = consumed_chars_base;
1488 ONE_MORE_BYTE (c);
1489 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1490 coding->errors++;
1493 no_more_source:
1494 coding->consumed_char += consumed_chars_base;
1495 coding->consumed = src_base - coding->source;
1496 coding->charbuf_used = charbuf - coding->charbuf;
1500 static bool
1501 encode_coding_utf_8 (struct coding_system *coding)
1503 bool multibytep = coding->dst_multibyte;
1504 int *charbuf = coding->charbuf;
1505 int *charbuf_end = charbuf + coding->charbuf_used;
1506 unsigned char *dst = coding->destination + coding->produced;
1507 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1508 ptrdiff_t produced_chars = 0;
1509 int c;
1511 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1513 ASSURE_DESTINATION (3);
1514 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1515 CODING_UTF_8_BOM (coding) = utf_without_bom;
1518 if (multibytep)
1520 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1522 while (charbuf < charbuf_end)
1524 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1526 ASSURE_DESTINATION (safe_room);
1527 c = *charbuf++;
1528 if (CHAR_BYTE8_P (c))
1530 c = CHAR_TO_BYTE8 (c);
1531 EMIT_ONE_BYTE (c);
1533 else
1535 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1536 for (p = str; p < pend; p++)
1537 EMIT_ONE_BYTE (*p);
1541 else
1543 int safe_room = MAX_MULTIBYTE_LENGTH;
1545 while (charbuf < charbuf_end)
1547 ASSURE_DESTINATION (safe_room);
1548 c = *charbuf++;
1549 if (CHAR_BYTE8_P (c))
1550 *dst++ = CHAR_TO_BYTE8 (c);
1551 else
1552 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1554 produced_chars = dst - (coding->destination + coding->produced);
1556 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1557 coding->produced_char += produced_chars;
1558 coding->produced = dst - coding->destination;
1559 return 0;
/* Predicates on a 16-bit UTF-16 code unit VAL: true when VAL lies in
   0xD800..0xDBFF (high surrogate) or 0xDC00..0xDFFF (low surrogate)
   respectively.

   See the above "GENERAL NOTES on `detect_coding_XXX ()' functions"
   for the UTF-16 detector that follows.  */

#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1573 static bool
1574 detect_coding_utf_16 (struct coding_system *coding,
1575 struct coding_detection_info *detect_info)
1577 const unsigned char *src = coding->source;
1578 const unsigned char *src_end = coding->source + coding->src_bytes;
1579 bool multibytep = coding->src_multibyte;
1580 int c1, c2;
1582 detect_info->checked |= CATEGORY_MASK_UTF_16;
1583 if (coding->mode & CODING_MODE_LAST_BLOCK
1584 && (coding->src_chars & 1))
1586 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1587 return 0;
1590 TWO_MORE_BYTES (c1, c2);
1591 if ((c1 == 0xFF) && (c2 == 0xFE))
1593 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1594 | CATEGORY_MASK_UTF_16_AUTO);
1595 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1596 | CATEGORY_MASK_UTF_16_BE_NOSIG
1597 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1599 else if ((c1 == 0xFE) && (c2 == 0xFF))
1601 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1602 | CATEGORY_MASK_UTF_16_AUTO);
1603 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1604 | CATEGORY_MASK_UTF_16_BE_NOSIG
1605 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1607 else if (c2 < 0)
1609 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1610 return 0;
1612 else
1614 /* We check the dispersion of Eth and Oth bytes where E is even and
1615 O is odd. If both are high, we assume binary data.*/
1616 unsigned char e[256], o[256];
1617 unsigned e_num = 1, o_num = 1;
1619 memset (e, 0, 256);
1620 memset (o, 0, 256);
1621 e[c1] = 1;
1622 o[c2] = 1;
1624 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1625 |CATEGORY_MASK_UTF_16_BE
1626 | CATEGORY_MASK_UTF_16_LE);
1628 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1629 != CATEGORY_MASK_UTF_16)
1631 TWO_MORE_BYTES (c1, c2);
1632 if (c2 < 0)
1633 break;
1634 if (! e[c1])
1636 e[c1] = 1;
1637 e_num++;
1638 if (e_num >= 128)
1639 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1641 if (! o[c2])
1643 o[c2] = 1;
1644 o_num++;
1645 if (o_num >= 128)
1646 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1649 return 0;
1652 no_more_source:
1653 return 1;
1656 static void
1657 decode_coding_utf_16 (struct coding_system *coding)
1659 const unsigned char *src = coding->source + coding->consumed;
1660 const unsigned char *src_end = coding->source + coding->src_bytes;
1661 const unsigned char *src_base;
1662 int *charbuf = coding->charbuf + coding->charbuf_used;
1663 /* We may produces at most 3 chars in one loop. */
1664 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1665 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1666 bool multibytep = coding->src_multibyte;
1667 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1668 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1669 int surrogate = CODING_UTF_16_SURROGATE (coding);
1670 bool eol_dos
1671 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1672 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1674 if (bom == utf_with_bom)
1676 int c, c1, c2;
1678 src_base = src;
1679 ONE_MORE_BYTE (c1);
1680 ONE_MORE_BYTE (c2);
1681 c = (c1 << 8) | c2;
1683 if (endian == utf_16_big_endian
1684 ? c != 0xFEFF : c != 0xFFFE)
1686 /* The first two bytes are not BOM. Treat them as bytes
1687 for a normal character. */
1688 src = src_base;
1689 coding->errors++;
1691 CODING_UTF_16_BOM (coding) = utf_without_bom;
1693 else if (bom == utf_detect_bom)
1695 /* We have already tried to detect BOM and failed in
1696 detect_coding. */
1697 CODING_UTF_16_BOM (coding) = utf_without_bom;
1700 while (1)
1702 int c, c1, c2;
1704 src_base = src;
1705 consumed_chars_base = consumed_chars;
1707 if (charbuf >= charbuf_end)
1709 if (byte_after_cr1 >= 0)
1710 src_base -= 2;
1711 break;
1714 if (byte_after_cr1 >= 0)
1715 c1 = byte_after_cr1, byte_after_cr1 = -1;
1716 else
1717 ONE_MORE_BYTE (c1);
1718 if (c1 < 0)
1720 *charbuf++ = -c1;
1721 continue;
1723 if (byte_after_cr2 >= 0)
1724 c2 = byte_after_cr2, byte_after_cr2 = -1;
1725 else
1726 ONE_MORE_BYTE (c2);
1727 if (c2 < 0)
1729 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1730 *charbuf++ = -c2;
1731 continue;
1733 c = (endian == utf_16_big_endian
1734 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1736 if (surrogate)
1738 if (! UTF_16_LOW_SURROGATE_P (c))
1740 if (endian == utf_16_big_endian)
1741 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1742 else
1743 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1744 *charbuf++ = c1;
1745 *charbuf++ = c2;
1746 coding->errors++;
1747 if (UTF_16_HIGH_SURROGATE_P (c))
1748 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1749 else
1750 *charbuf++ = c;
1752 else
1754 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1755 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1756 *charbuf++ = 0x10000 + c;
1759 else
1761 if (UTF_16_HIGH_SURROGATE_P (c))
1762 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1763 else
1765 if (eol_dos && c == '\r')
1767 ONE_MORE_BYTE (byte_after_cr1);
1768 ONE_MORE_BYTE (byte_after_cr2);
1770 *charbuf++ = c;
1775 no_more_source:
1776 coding->consumed_char += consumed_chars_base;
1777 coding->consumed = src_base - coding->source;
1778 coding->charbuf_used = charbuf - coding->charbuf;
1781 static bool
1782 encode_coding_utf_16 (struct coding_system *coding)
1784 bool multibytep = coding->dst_multibyte;
1785 int *charbuf = coding->charbuf;
1786 int *charbuf_end = charbuf + coding->charbuf_used;
1787 unsigned char *dst = coding->destination + coding->produced;
1788 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1789 int safe_room = 8;
1790 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1791 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1792 ptrdiff_t produced_chars = 0;
1793 int c;
1795 if (bom != utf_without_bom)
1797 ASSURE_DESTINATION (safe_room);
1798 if (big_endian)
1799 EMIT_TWO_BYTES (0xFE, 0xFF);
1800 else
1801 EMIT_TWO_BYTES (0xFF, 0xFE);
1802 CODING_UTF_16_BOM (coding) = utf_without_bom;
1805 while (charbuf < charbuf_end)
1807 ASSURE_DESTINATION (safe_room);
1808 c = *charbuf++;
1809 if (c > MAX_UNICODE_CHAR)
1810 c = coding->default_char;
1812 if (c < 0x10000)
1814 if (big_endian)
1815 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1816 else
1817 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1819 else
1821 int c1, c2;
1823 c -= 0x10000;
1824 c1 = (c >> 10) + 0xD800;
1825 c2 = (c & 0x3FF) + 0xDC00;
1826 if (big_endian)
1827 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1828 else
1829 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1832 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1833 coding->produced = dst - coding->destination;
1834 coding->produced_char += produced_chars;
1835 return 0;
/*** 6. Old Emacs' internal format (emacs-mule) ***/
/* Emacs' internal format for representation of multiple character
   sets is a kind of multi-byte encoding, i.e. characters are
   represented by variable-length sequences of one-byte codes.

   ASCII characters and control characters (e.g. `tab', `newline') are
   represented by one-byte sequences which are their ASCII codes, in
   the range 0x00 through 0x7F.

   8-bit characters of the range 0x80..0x9F are represented by
   two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
   code + 0x20).

   8-bit characters of the range 0xA0..0xFF are represented by
   one-byte sequences which are their 8-bit code.

   The other characters are represented by a sequence of `base
   leading-code', optional `extended leading-code', and one or two
   `position-code's.  The length of the sequence is determined by the
   base leading-code.  Leading-code takes the range 0x81 through 0x9D,
   whereas extended leading-code and position-code take the range 0xA0
   through 0xFF.  See `charset.h' for more details about leading-code
   and position-code.

   --- CODE RANGE of Emacs' internal format ---
   character set        range
   -------------        -----
   ascii                0x00..0x7F
   eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic    0xA0..0xFF
   ELSE                 0x81..0x9D + [0xA0..0xFF]+
   ---------------------------------------------

   As this is the internal character representation, the format is
   usually not used externally (i.e. in a file or in a data sent to a
   process).  But, it is possible to have a text externally in this
   format (i.e. by encoding by the coding system `emacs-mule').

   In that case, a sequence of one-byte codes has a slightly different
   form.

   At first, all characters in eight-bit-control are represented by
   one-byte sequences which are their 8-bit code.

   Next, character composition data are represented by the byte
   sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
   where,
        METHOD is 0xF2 plus one of composition method (enum
        composition_method),

        BYTES is 0xA0 plus a byte length of this composition data,

        CHARS is 0xA0 plus a number of characters composed by this
        data,

        COMPONENTs are characters of multibyte form or composition
        rules encoded by two-byte of ASCII codes.

   In addition, for backward compatibility, the following formats are
   also recognized as composition data on decoding.

   0x80 MSEQ ...
   0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ

   Here,
        MSEQ is a multibyte form but in these special format:
          ASCII: 0xA0 ASCII_CODE+0x80,
          other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
        RULE is a one byte code of the range 0xA0..0xF0 that
        represents a composition rule.
  */

/* Table indexed by a byte value.  The emacs-mule routines in this
   file read the entry for a leading byte as the total byte length of
   the sequence that the byte starts.  */
char emacs_mule_bytes[256];
1915 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1916 Return true if a text is encoded in 'emacs-mule'. */
1918 static bool
1919 detect_coding_emacs_mule (struct coding_system *coding,
1920 struct coding_detection_info *detect_info)
1922 const unsigned char *src = coding->source, *src_base;
1923 const unsigned char *src_end = coding->source + coding->src_bytes;
1924 bool multibytep = coding->src_multibyte;
1925 ptrdiff_t consumed_chars = 0;
1926 int c;
1927 int found = 0;
1929 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1930 /* A coding system of this category is always ASCII compatible. */
1931 src += coding->head_ascii;
1933 while (1)
1935 src_base = src;
1936 ONE_MORE_BYTE (c);
1937 if (c < 0)
1938 continue;
1939 if (c == 0x80)
1941 /* Perhaps the start of composite character. We simply skip
1942 it because analyzing it is too heavy for detecting. But,
1943 at least, we check that the composite character
1944 constitutes of more than 4 bytes. */
1945 const unsigned char *src_start;
1947 repeat:
1948 src_start = src;
1951 ONE_MORE_BYTE (c);
1953 while (c >= 0xA0);
1955 if (src - src_start <= 4)
1956 break;
1957 found = CATEGORY_MASK_EMACS_MULE;
1958 if (c == 0x80)
1959 goto repeat;
1962 if (c < 0x80)
1964 if (c < 0x20
1965 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1966 break;
1968 else
1970 int more_bytes = emacs_mule_bytes[c] - 1;
1972 while (more_bytes > 0)
1974 ONE_MORE_BYTE (c);
1975 if (c < 0xA0)
1977 src--; /* Unread the last byte. */
1978 break;
1980 more_bytes--;
1982 if (more_bytes != 0)
1983 break;
1984 found = CATEGORY_MASK_EMACS_MULE;
1987 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1988 return 0;
1990 no_more_source:
1991 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1993 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1994 return 0;
1996 detect_info->found |= found;
1997 return 1;
2001 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2002 character. If CMP_STATUS indicates that we must expect MSEQ or
2003 RULE described above, decode it and return the negative value of
2004 the decoded character or rule. If an invalid byte is found, return
2005 -1. If SRC is too short, return -2. */
2007 static int
2008 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2009 int *nbytes, int *nchars, int *id,
2010 struct composition_status *cmp_status)
2012 const unsigned char *src_end = coding->source + coding->src_bytes;
2013 const unsigned char *src_base = src;
2014 bool multibytep = coding->src_multibyte;
2015 int charset_ID;
2016 unsigned code;
2017 int c;
2018 ptrdiff_t consumed_chars = 0;
2019 bool mseq_found = 0;
2021 ONE_MORE_BYTE (c);
2022 if (c < 0)
2024 c = -c;
2025 charset_ID = emacs_mule_charset[0];
2027 else
2029 if (c >= 0xA0)
2031 if (cmp_status->state != COMPOSING_NO
2032 && cmp_status->old_form)
2034 if (cmp_status->state == COMPOSING_CHAR)
2036 if (c == 0xA0)
2038 ONE_MORE_BYTE (c);
2039 c -= 0x80;
2040 if (c < 0)
2041 goto invalid_code;
2043 else
2044 c -= 0x20;
2045 mseq_found = 1;
2047 else
2049 *nbytes = src - src_base;
2050 *nchars = consumed_chars;
2051 return -c;
2054 else
2055 goto invalid_code;
2058 switch (emacs_mule_bytes[c])
2060 case 2:
2061 if ((charset_ID = emacs_mule_charset[c]) < 0)
2062 goto invalid_code;
2063 ONE_MORE_BYTE (c);
2064 if (c < 0xA0)
2065 goto invalid_code;
2066 code = c & 0x7F;
2067 break;
2069 case 3:
2070 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2071 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2073 ONE_MORE_BYTE (c);
2074 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2075 goto invalid_code;
2076 ONE_MORE_BYTE (c);
2077 if (c < 0xA0)
2078 goto invalid_code;
2079 code = c & 0x7F;
2081 else
2083 if ((charset_ID = emacs_mule_charset[c]) < 0)
2084 goto invalid_code;
2085 ONE_MORE_BYTE (c);
2086 if (c < 0xA0)
2087 goto invalid_code;
2088 code = (c & 0x7F) << 8;
2089 ONE_MORE_BYTE (c);
2090 if (c < 0xA0)
2091 goto invalid_code;
2092 code |= c & 0x7F;
2094 break;
2096 case 4:
2097 ONE_MORE_BYTE (c);
2098 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2099 goto invalid_code;
2100 ONE_MORE_BYTE (c);
2101 if (c < 0xA0)
2102 goto invalid_code;
2103 code = (c & 0x7F) << 8;
2104 ONE_MORE_BYTE (c);
2105 if (c < 0xA0)
2106 goto invalid_code;
2107 code |= c & 0x7F;
2108 break;
2110 case 1:
2111 code = c;
2112 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2113 break;
2115 default:
2116 emacs_abort ();
2118 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2119 CHARSET_FROM_ID (charset_ID), code, c);
2120 if (c < 0)
2121 goto invalid_code;
2123 *nbytes = src - src_base;
2124 *nchars = consumed_chars;
2125 if (id)
2126 *id = charset_ID;
2127 return (mseq_found ? -c : c);
2129 no_more_source:
2130 return -2;
2132 invalid_code:
2133 return -1;
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */

/* Handle these composition sequences ('|': the end of header elements,
   BYTES and CHARS >= 0xA0):

   (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
   (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
   (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...

   and these old forms:

   (4) relative composition: 0x80 | MSEQ ... MSEQ
   (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ

   When the starter 0x80 and the following header elements are found,
   this annotation header is produced.

        [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]

   NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
   NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).

   Then, upon reading the following elements, these codes are produced
   until the composition end is found:

   (1) CHAR ... CHAR
   (2) ALT ... ALT CHAR ... CHAR
   (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
   (4) CHAR ... CHAR
   (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR

   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:

   (1) LENGTH: unchanged, NCHARS: unchanged
   (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
   (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
   (4) LENGTH: unchanged,  NCHARS: number of CHARs
   (5) LENGTH: unchanged,  NCHARS: number of CHARs

   If an error is found while composing, the annotation header is
   changed to the original composition header (plus filler -1s) as
   below:

   (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
   (5)          [ 0x80 0xFF -1 -1 -1 ]

   and the sequence [ -2 DECODED-RULE ] is changed to the original
   byte sequence as below:
        o the original byte sequence is B: [ B -1 ]
        o the original byte sequence is B1 B2: [ B1 B2 ]

   Most of the routines are implemented by macros because many
   variables and labels in the caller decode_coding_emacs_mule must be
   accessible, and they are usually called just once (thus they don't
   increase the size of the compiled object).  */
/* Decode a composition rule represented by C as a component of
   composition sequence of Emacs 20 style.  Set RULE to the decoded
   rule.

   C is modified: 0xA0 is subtracted from it, and the result must be
   in 0..80 or control jumps to the caller's `invalid_code' label.
   The result is split into C / 9 and C % 9, each with the value 4
   mapped to 10, and the two are combined with
   COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    (c) -= 0xA0;                                        \
    if ((c) < 0 || (c) >= 81)                           \
      goto invalid_code;                                \
    gref = (c) / 9, nref = (c) % 9;                     \
    if (gref == 4) gref = 10;                           \
    if (nref == 4) nref = 10;                           \
    (rule) = COMPOSITION_ENCODE_RULE (gref, nref);      \
  } while (0)
/* Decode a composition rule represented by C and the following byte
   at SRC as a component of composition sequence of Emacs 21 style.
   Set RULE to the decoded rule.

   C minus 0x20 is the first reference; the next byte, fetched into C
   with ONE_MORE_BYTE, minus 0x20 is the second.  Each must be in
   0..80 or control jumps to the caller's `invalid_code' label.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    gref = (c) - 0x20;                                  \
    if (gref < 0 || gref >= 81)                         \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = (c) - 0x20;                                  \
    if (nref < 0 || nref >= 81)                         \
      goto invalid_code;                                \
    (rule) = COMPOSITION_ENCODE_RULE (gref, nref);      \
  } while (0)
/* Start of Emacs 21 style format.  On entry C holds the first header
   byte, METHOD + 0xF2.  The next two bytes, read with ONE_MORE_BYTE,
   are BYTES + 0xA0 and CHARS + 0xA0, where BYTES is the byte length of
   this composition information and CHARS is the number of characters
   composed by this composition.

   BYTES must be at least 3 (exactly 4 for COMPOSITION_RELATIVE) and
   CHARS must be in 1..MAX_COMPOSITION_COMPONENTS-1; otherwise control
   jumps to `invalid_code'.  On success *CMP_STATUS is initialized and
   a composition annotation header is appended to CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int byte_len, char_count;                                           \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    byte_len = c - 0xA0;                                                \
    if (byte_len < 3)                                                   \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE && byte_len != 4)                \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    char_count = c - 0xA0;                                              \
    if (! (char_count > 0 && char_count < MAX_COMPOSITION_COMPONENTS))  \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = char_count;                                    \
    cmp_status->ncomps = byte_len - 4;                                  \
    ADD_COMPOSITION_DATA (charbuf, char_count, byte_len, method);       \
  } while (0)
/* Start of Emacs 20 style format for relative composition.  Reset
   *CMP_STATUS for an old-form relative composition with no characters
   collected yet, and append a composition annotation header whose
   NCHARS and NBYTES fields are both 0 to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()                     \
  do {                                                                  \
    cmp_status->nchars = 0;                                             \
    cmp_status->ncomps = 0;                                             \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->state = COMPOSING_CHAR;                                 \
    cmp_status->method = COMPOSITION_RELATIVE;                          \
    cmp_status->old_form = 1;                                           \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, COMPOSITION_RELATIVE);         \
  } while (0)
/* Start of Emacs 20 style format for rule-base composition.  Reset
   *CMP_STATUS for an old-form COMPOSITION_WITH_RULE composition with
   no characters collected yet, and append a composition annotation
   header whose NCHARS and NBYTES fields are both 0 to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()                     \
  do {                                                                  \
    cmp_status->nchars = 0;                                             \
    cmp_status->ncomps = 0;                                             \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->state = COMPOSING_CHAR;                                 \
    cmp_status->method = COMPOSITION_WITH_RULE;                         \
    cmp_status->old_form = 1;                                           \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, COMPOSITION_WITH_RULE);        \
  } while (0)
/* Begin decoding a composition.  Read the next byte into C with
   ONE_MORE_BYTE and dispatch on it:

     0xF2 + METHOD  (METHOD from COMPOSITION_RELATIVE through
		     COMPOSITION_WITH_RULE_ALTCHARS)
		    -- Emacs 21 style header
     0xA0..0xBF     -- Emacs 20 style relative composition; SRC is put
		       back to where it was before the read so that the
		       byte is re-read as a composition component
     0xFF           -- Emacs 20 style rule-base composition

   Any other value, including a negative C, jumps to the
   `invalid_code' label of the expanding function.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                           \
  do {                                                                  \
    const unsigned char *saved_src = src;                               \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)                  \
      DECODE_EMACS_MULE_21_COMPOSITION ();                              \
    else if (c >= 0xA0 && c < 0xC0)                                     \
      {                                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();                   \
        /* Re-read C as a composition component.  */                    \
        src = saved_src;                                                \
      }                                                                 \
    else if (c == 0xFF)                                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();                     \
    else                                                                \
      goto invalid_code;                                                \
  } while (0)
/* Finish a composition normally.  The annotation header starts
   CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition, element 2 of the header is set to CMP_STATUS->nchars.
   For a new-form composition whose method is greater than
   COMPOSITION_RELATIVE, element 0 is set to element 2 minus
   CMP_STATUS->length.  In every case the state becomes
   COMPOSING_NO.  */

#define EMACS_MULE_COMPOSITION_END()                                    \
  do {                                                                  \
    int *header = charbuf - cmp_status->length;                         \
                                                                        \
    if (cmp_status->old_form)                                           \
      header[2] = cmp_status->nchars;                                   \
    else if (cmp_status->method > COMPOSITION_RELATIVE)                 \
      header[0] = header[2] - cmp_status->length;                       \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
/* Close a composition that is being abandoned before its normal end.
   CHARBUF points just past the last element stored; the annotation
   header begins CMP_STATUS->length elements before it.

   For an old-form composition that has collected at least one
   character, element 2 of the header is set to CMP_STATUS->nchars;
   additionally, if the method is COMPOSITION_WITH_RULE and the state
   is COMPOSING_CHAR, the two elements just before CHARBUF are
   replaced by the rule byte as an eight-bit character followed
   by -1.  In every other case the header is overwritten starting with
   the eight-bit character for 0x80.

   Sets CMP_STATUS->state to COMPOSING_NO and returns the count the
   caller adds to its character offset (0, 1 or 4).  */
2326 static int
2327 emacs_mule_finish_composition (int *charbuf,
2328 struct composition_status *cmp_status)
2330 int idx = - cmp_status->length;
2331 int new_chars;
2333 if (cmp_status->old_form && cmp_status->nchars > 0)
2335 charbuf[idx + 2] = cmp_status->nchars;
2336 new_chars = 0;
2337 if (cmp_status->method == COMPOSITION_WITH_RULE
2338 && cmp_status->state == COMPOSING_CHAR)
2340 /* The last rule was invalid. */
2341 int rule = charbuf[-1] + 0xA0;
2343 charbuf[-2] = BYTE8_TO_CHAR (rule);
2344 charbuf[-1] = -1;
2345 new_chars = 1;
2348 else
/* Turn the annotation header back into raw bytes, 0x80 first. */
2350 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2352 if (cmp_status->method == COMPOSITION_WITH_RULE)
2354 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2355 charbuf[idx++] = -3;
2356 charbuf[idx++] = 0;
2357 new_chars = 1;
2359 else
/* IDX now indexes header element 1, so these read elements 2 and 3. */
2361 int nchars = charbuf[idx + 1] + 0xA0;
2362 int nbytes = charbuf[idx + 2] + 0xA0;
2364 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2365 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2366 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2367 charbuf[idx++] = -1;
2368 new_chars = 4;
2371 cmp_status->state = COMPOSING_NO;
2372 return new_chars;
/* If a composition is in progress (CMP_STATUS->state is not
   COMPOSING_NO), close it with emacs_mule_finish_composition and add
   the value that function returns to CHAR_OFFSET.  Otherwise do
   nothing.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    if (cmp_status->state == COMPOSING_NO)                              \
      break;                                                            \
    char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
  } while (0)
/* Decode emacs-mule bytes from CODING->source, beginning at offset
   CODING->consumed, into CODING->charbuf, beginning at index
   CODING->charbuf_used.

   Each loop iteration reads one byte, or one multibyte sequence via
   emacs_mule_char, and depending on CMP_STATUS->state stores it
   either as a plain character or as part of a composition annotation.
   A charset annotation is emitted whenever the charset ID changes
   from a non-ASCII one.  An invalid sequence is emitted byte by byte
   and counted in CODING->errors.

   On exit CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used are updated.  A composition still open when
   the source runs out is finished if this is the last block;
   otherwise its elements are taken back out of CHARBUF and saved in
   CMP_STATUS->carryover.  */
2382 static void
2383 decode_coding_emacs_mule (struct coding_system *coding)
2385 const unsigned char *src = coding->source + coding->consumed;
2386 const unsigned char *src_end = coding->source + coding->src_bytes;
2387 const unsigned char *src_base;
2388 int *charbuf = coding->charbuf + coding->charbuf_used;
2389 /* We may produce two annotations (charset and composition) in one
2390 loop and one more charset annotation at the end. */
2391 int *charbuf_end
2392 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2393 /* We can produce up to 2 characters in a loop. */
2394 - 1;
2395 ptrdiff_t consumed_chars = 0, consumed_chars_base;
2396 bool multibytep = coding->src_multibyte;
2397 ptrdiff_t char_offset = coding->produced_char;
2398 ptrdiff_t last_offset = char_offset;
2399 int last_id = charset_ascii;
2400 bool eol_dos
2401 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2402 int byte_after_cr = -1;
2403 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
/* A composition is already open on entry: put its saved elements
   back at the front of CHARBUF.  */
2405 if (cmp_status->state != COMPOSING_NO)
2407 int i;
2409 if (charbuf_end - charbuf < cmp_status->length)
2410 emacs_abort ();
2411 for (i = 0; i < cmp_status->length; i++)
2412 *charbuf++ = cmp_status->carryover[i];
2413 coding->annotated = 1;
2416 while (1)
2418 int c, id IF_LINT (= 0);
2420 src_base = src;
2421 consumed_chars_base = consumed_chars;
2423 if (charbuf >= charbuf_end)
2425 if (byte_after_cr >= 0)
2426 src_base--;
2427 break;
2430 if (byte_after_cr >= 0)
2431 c = byte_after_cr, byte_after_cr = -1;
2432 else
2433 ONE_MORE_BYTE (c);
/* A negative C is stored negated as a character; the byte 0x80
   starts a new composition.  Either one first closes any
   composition in progress.  */
2435 if (c < 0 || c == 0x80)
2437 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2438 if (c < 0)
2440 *charbuf++ = -c;
2441 char_offset++;
2443 else
2444 DECODE_EMACS_MULE_COMPOSITION_START ();
2445 continue;
2448 if (c < 0x80)
2450 if (eol_dos && c == '\r')
2451 ONE_MORE_BYTE (byte_after_cr);
2452 id = charset_ascii;
2453 if (cmp_status->state != COMPOSING_NO)
2455 if (cmp_status->old_form)
2456 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2457 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2458 cmp_status->ncomps--;
2461 else
2463 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2464 /* emacs_mule_char can load a charset map from a file, which
2465 allocates a large structure and might cause buffer text
2466 to be relocated as result. Thus, we need to remember the
2467 original pointer to buffer text, and fix up all related
2468 pointers after the call. */
2469 const unsigned char *orig = coding->source;
2470 ptrdiff_t offset;
2472 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2473 cmp_status);
2474 offset = coding->source - orig;
2475 if (offset)
2477 src += offset;
2478 src_base += offset;
2479 src_end += offset;
2481 if (c < 0)
2483 if (c == -1)
2484 goto invalid_code;
2485 if (c == -2)
2486 break;
2488 src = src_base + nbytes;
2489 consumed_chars = consumed_chars_base + nchars;
2490 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2491 cmp_status->ncomps -= nchars;
2494 /* Now if C >= 0, we found a normally encoded character, if C <
2495 0, we found an old-style composition component character or
2496 rule. */
2498 if (cmp_status->state == COMPOSING_NO)
2500 if (last_id != id)
2502 if (last_id != charset_ascii)
2503 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2504 last_id);
2505 last_id = id;
2506 last_offset = char_offset;
2508 *charbuf++ = c;
2509 char_offset++;
2511 else if (cmp_status->state == COMPOSING_CHAR)
2513 if (cmp_status->old_form)
2515 if (c >= 0)
2517 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2518 *charbuf++ = c;
2519 char_offset++;
2521 else
2523 *charbuf++ = -c;
2524 cmp_status->nchars++;
2525 cmp_status->length++;
2526 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2527 EMACS_MULE_COMPOSITION_END ();
2528 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2529 cmp_status->state = COMPOSING_RULE;
2532 else
2534 *charbuf++ = c;
2535 cmp_status->length++;
2536 cmp_status->nchars--;
2537 if (cmp_status->nchars == 0)
2538 EMACS_MULE_COMPOSITION_END ();
2541 else if (cmp_status->state == COMPOSING_RULE)
2543 int rule;
2545 if (c >= 0)
2547 EMACS_MULE_COMPOSITION_END ();
2548 *charbuf++ = c;
2549 char_offset++;
2551 else
2553 c = -c;
2554 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2555 if (rule < 0)
2556 goto invalid_code;
2557 *charbuf++ = -2;
2558 *charbuf++ = rule;
2559 cmp_status->length += 2;
2560 cmp_status->state = COMPOSING_CHAR;
2563 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2565 *charbuf++ = c;
2566 cmp_status->length++;
2567 if (cmp_status->ncomps == 0)
2568 cmp_status->state = COMPOSING_CHAR;
2569 else if (cmp_status->ncomps > 0)
2571 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2572 cmp_status->state = COMPOSING_COMPONENT_RULE;
2574 else
2575 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2577 else /* COMPOSING_COMPONENT_RULE */
2579 int rule;
2581 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2582 if (rule < 0)
2583 goto invalid_code;
2584 *charbuf++ = -2;
2585 *charbuf++ = rule;
2586 cmp_status->length += 2;
2587 cmp_status->ncomps--;
2588 if (cmp_status->ncomps > 0)
2589 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2590 else
2591 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2593 continue;
/* Rewind to the start of the offending sequence and emit its first
   byte on its own, as ASCII or as an eight-bit character.  */
2595 invalid_code:
2596 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2597 src = src_base;
2598 consumed_chars = consumed_chars_base;
2599 ONE_MORE_BYTE (c);
2600 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2601 char_offset++;
2602 coding->errors++;
2605 no_more_source:
2606 if (cmp_status->state != COMPOSING_NO)
2608 if (coding->mode & CODING_MODE_LAST_BLOCK)
2609 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2610 else
2612 int i;
2614 charbuf -= cmp_status->length;
2615 for (i = 0; i < cmp_status->length; i++)
2616 cmp_status->carryover[i] = charbuf[i];
2619 if (last_id != charset_ascii)
2620 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2621 coding->consumed_char += consumed_chars_base;
2622 coding->consumed = src_base - coding->source;
2623 coding->charbuf_used = charbuf - coding->charbuf;
/* Store in CODES[0] and CODES[1] the leading code(s) for emacs-mule
   charset ID:

     ID < 0xA0           CODES[0] = ID,   CODES[1] = 0
     0xA0 <= ID < 0xE0   CODES[0] = 0x9A, CODES[1] = ID
     0xE0 <= ID < 0xF0   CODES[0] = 0x9B, CODES[1] = ID
     0xF0 <= ID < 0xF5   CODES[0] = 0x9C, CODES[1] = ID
     otherwise           CODES[0] = 0x9D, CODES[1] = ID

   ID is evaluated exactly once.  The expansion deliberately has no
   trailing semicolon, so that an invocation followed by `;' is a
   single statement and can be used as the body of `if ... else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)                             \
  do {                                                                  \
    const int leading_id_ = (id);                                       \
                                                                        \
    if (leading_id_ < 0xA0)                                             \
      (codes)[0] = leading_id_, (codes)[1] = 0;                         \
    else if (leading_id_ < 0xE0)                                        \
      (codes)[0] = 0x9A, (codes)[1] = leading_id_;                      \
    else if (leading_id_ < 0xF0)                                        \
      (codes)[0] = 0x9B, (codes)[1] = leading_id_;                      \
    else if (leading_id_ < 0xF5)                                        \
      (codes)[0] = 0x9C, (codes)[1] = leading_id_;                      \
    else                                                                \
      (codes)[0] = 0x9D, (codes)[1] = leading_id_;                      \
  } while (0)
/* Encode the elements of CODING->charbuf as emacs-mule bytes written
   to CODING->destination, starting at offset CODING->produced.

   A negative element introduces an annotation, which is consumed
   without producing output; a charset annotation may select a
   preferred charset that is tried first for the following characters.
   ASCII characters and eight-bit characters are emitted as one byte.
   Any other character is emitted as the leading code(s) of its
   charset followed by one or two code bytes with the high bit set.  A
   character for which no charset is found is replaced by
   CODING->default_char.

   If the coding system's charset list is not
   Vemacs_mule_charset_list, it is replaced by that list in ATTRS.
   Records CODING_RESULT_SUCCESS, updates CODING->produced_char and
   CODING->produced, and returns 0.  */
2642 static bool
2643 encode_coding_emacs_mule (struct coding_system *coding)
2645 bool multibytep = coding->dst_multibyte;
2646 int *charbuf = coding->charbuf;
2647 int *charbuf_end = charbuf + coding->charbuf_used;
2648 unsigned char *dst = coding->destination + coding->produced;
2649 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2650 int safe_room = 8;
2651 ptrdiff_t produced_chars = 0;
2652 Lisp_Object attrs, charset_list;
2653 int c;
2654 int preferred_charset_id = -1;
2656 CODING_GET_INFO (coding, attrs, charset_list);
2657 if (! EQ (charset_list, Vemacs_mule_charset_list))
2659 charset_list = Vemacs_mule_charset_list;
2660 ASET (attrs, coding_attr_charset_list, charset_list);
2663 while (charbuf < charbuf_end)
2665 ASSURE_DESTINATION (safe_room);
2666 c = *charbuf++;
2668 if (c < 0)
2670 /* Handle an annotation. */
2671 switch (*charbuf)
2673 case CODING_ANNOTATE_COMPOSITION_MASK:
2674 /* Not yet implemented. */
2675 break;
2676 case CODING_ANNOTATE_CHARSET_MASK:
/* NOTE(review): CHARBUF was already advanced past the annotation's
   first element above, and `charbuf += -c - 1' below skips -C - 1
   more elements.  For an annotation of 4 elements, index 3 would
   therefore be the element following the annotation.  Confirm
   against the layout written by ADD_CHARSET_DATA.  */
2677 preferred_charset_id = charbuf[3];
2678 if (preferred_charset_id >= 0
2679 && NILP (Fmemq (make_number (preferred_charset_id),
2680 charset_list)))
2681 preferred_charset_id = -1;
2682 break;
2683 default:
2684 emacs_abort ();
/* Skip the remaining elements of the annotation. */
2686 charbuf += -c - 1;
2687 continue;
2690 if (ASCII_CHAR_P (c))
2691 EMIT_ONE_ASCII_BYTE (c);
2692 else if (CHAR_BYTE8_P (c))
2694 c = CHAR_TO_BYTE8 (c);
2695 EMIT_ONE_BYTE (c);
2697 else
2699 struct charset *charset;
2700 unsigned code;
2701 int dimension;
2702 int emacs_mule_id;
2703 unsigned char leading_codes[2];
2705 if (preferred_charset_id >= 0)
2707 bool result;
2709 charset = CHARSET_FROM_ID (preferred_charset_id);
2710 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2711 if (result)
2712 code = ENCODE_CHAR (charset, c);
2713 else
2714 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2715 &code, charset);
2717 else
2718 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2719 &code, charset);
/* No charset can encode C: fall back to the default character. */
2720 if (! charset)
2722 c = coding->default_char;
2723 if (ASCII_CHAR_P (c))
2725 EMIT_ONE_ASCII_BYTE (c);
2726 continue;
2728 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2729 &code, charset);
2731 dimension = CHARSET_DIMENSION (charset);
2732 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2733 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2734 EMIT_ONE_BYTE (leading_codes[0]);
2735 if (leading_codes[1])
2736 EMIT_ONE_BYTE (leading_codes[1]);
2737 if (dimension == 1)
2738 EMIT_ONE_BYTE (code | 0x80);
2739 else
2741 code |= 0x8080;
2742 EMIT_ONE_BYTE (code >> 8);
2743 EMIT_ONE_BYTE (code & 0xFF);
2747 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2748 coding->produced_char += produced_chars;
2749 coding->produced = dst - coding->destination;
2750 return 0;
2754 /*** 7. ISO2022 handlers ***/
2756 /* The following note describes the coding system ISO2022 briefly.
2757 Since the intention of this note is to help understand the
2758 functions in this file, some parts are NOT ACCURATE or are OVERLY
2759 SIMPLIFIED. For thorough understanding, please refer to the
2760 original document of ISO2022. This is equivalent to the standard
2761 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2763 ISO2022 provides many mechanisms to encode several character sets
2764 in 7-bit and 8-bit environments. For 7-bit environments, all text
2765 is encoded using bytes less than 128. This may make the encoded
2766 text a little bit longer, but the text passes more easily through
2767 several types of gateway, some of which strip off the MSB (Most
2768 Significant Bit).
2770 There are two kinds of character sets: control character sets and
2771 graphic character sets. The former contain control characters such
2772 as `newline' and `escape' to provide control functions (control
2773 functions are also provided by escape sequences). The latter
2774 contain graphic characters such as 'A' and '-'. Emacs recognizes
2775 two control character sets and many graphic character sets.
2777 Graphic character sets are classified into one of the following
2778 four classes, according to the number of bytes (DIMENSION) and
2779 number of characters in one dimension (CHARS) of the set:
2780 - DIMENSION1_CHARS94
2781 - DIMENSION1_CHARS96
2782 - DIMENSION2_CHARS94
2783 - DIMENSION2_CHARS96
2785 In addition, each character set is assigned an identification tag,
2786 unique for each set, called the "final character" (denoted as <F>
2787 hereafter). The <F> of each character set is decided by ECMA(*)
2788 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2789 (0x30..0x3F are for private use only).
2791 Note (*): ECMA = European Computer Manufacturers Association
2793 Here are examples of graphic character sets [NAME(<F>)]:
2794 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2795 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2796 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2797 o DIMENSION2_CHARS96 -- none for the moment
2799 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2800 C0 [0x00..0x1F] -- control character plane 0
2801 GL [0x20..0x7F] -- graphic character plane 0
2802 C1 [0x80..0x9F] -- control character plane 1
2803 GR [0xA0..0xFF] -- graphic character plane 1
2805 A control character set is directly designated and invoked to C0 or
2806 C1 by an escape sequence. The most common case is that:
2807 - ISO646's control character set is designated/invoked to C0, and
2808 - ISO6429's control character set is designated/invoked to C1,
2809 and usually these designations/invocations are omitted in encoded
2810 text. In a 7-bit environment, only C0 can be used, and a control
2811 character for C1 is encoded by an appropriate escape sequence to
2812 fit into the environment. All control characters for C1 are
2813 defined to have corresponding escape sequences.
2815 A graphic character set is at first designated to one of four
2816 graphic registers (G0 through G3), then these graphic registers are
2817 invoked to GL or GR. These designations and invocations can be
2818 done independently. The most common case is that G0 is invoked to
2819 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2820 these invocations and designations are omitted in encoded text.
2821 In a 7-bit environment, only GL can be used.
2823 When a graphic character set of CHARS94 is invoked to GL, codes
2824 0x20 and 0x7F of the GL area work as control characters SPACE and
2825 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2826 be used.
2828 There are two ways of invocation: locking-shift and single-shift.
2829 With locking-shift, the invocation lasts until the next different
2830 invocation, whereas with single-shift, the invocation affects the
2831 following character only and doesn't affect the locking-shift
2832 state. Invocations are done by the following control characters or
2833 escape sequences:
2835 ----------------------------------------------------------------------
2836 abbrev function cntrl escape seq description
2837 ----------------------------------------------------------------------
2838 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2839 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2840 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2841 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2842 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2843 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2844 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2845 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2846 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2847 ----------------------------------------------------------------------
2848 (*) These are not used by any known coding system.
2850 Control characters for these functions are defined by macros
2851 ISO_CODE_XXX in `coding.h'.
2853 Designations are done by the following escape sequences:
2854 ----------------------------------------------------------------------
2855 escape sequence description
2856 ----------------------------------------------------------------------
2857 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2858 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2859 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2860 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2861 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2862 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2863 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2864 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2865 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2866 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2867 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2868 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2869 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2870 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2871 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2872 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2873 ----------------------------------------------------------------------
2875 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2876 of dimension 1, chars 94, and final character <F>, etc...
2878 Note (*): Although these designations are not allowed in ISO2022,
2879 Emacs accepts them on decoding, and produces them on encoding
2880 CHARS96 character sets in a coding system which is characterized as
2881 7-bit environment, non-locking-shift, and non-single-shift.
2883 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2884 '(' must be omitted. We refer to this as "short-form" hereafter.
2886 Now you may notice that there are a lot of ways of encoding the
2887 same multilingual text in ISO2022. Actually, there exist many
2888 coding systems such as Compound Text (used in X11's inter client
2889 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2890 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2891 localized platforms), and all of these are variants of ISO2022.
2893 In addition to the above, Emacs handles two more kinds of escape
2894 sequences: ISO6429's direction specification and Emacs' private
2895 sequence for specifying character composition.
2897 ISO6429's direction specification takes the following form:
2898 o CSI ']' -- end of the current direction
2899 o CSI '0' ']' -- end of the current direction
2900 o CSI '1' ']' -- start of left-to-right text
2901 o CSI '2' ']' -- start of right-to-left text
2902 The control character CSI (0x9B: control sequence introducer) is
2903 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2905 Character composition specification takes the following form:
2906 o ESC '0' -- start relative composition
2907 o ESC '1' -- end composition
2908 o ESC '2' -- start rule-base composition (*)
2909 o ESC '3' -- start relative composition with alternate chars (**)
2910 o ESC '4' -- start rule-base composition with alternate chars (**)
2911 Since these are not standard escape sequences of any ISO standard,
2912 the use of them with these meanings is restricted to Emacs only.
2914 (*) This form is used only in Emacs 20.7 and older versions,
2915 but newer versions can safely decode it.
2916 (**) This form is used only in Emacs 21.1 and newer versions,
2917 and older versions can't decode it.
2919 Here's a list of example usages of these composition escape
2920 sequences (categorized by `enum composition_method').
2922 COMPOSITION_RELATIVE:
2923 ESC 0 CHAR [ CHAR ] ESC 1
2924 COMPOSITION_WITH_RULE:
2925 ESC 2 CHAR [ RULE CHAR ] ESC 1
2926 COMPOSITION_WITH_ALTCHARS:
2927 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2928 COMPOSITION_WITH_RULE_ALTCHARS:
2929 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2931 static enum iso_code_class_type iso_code_class[256];
/* True if charset ID may be used with CODING: ID must lie in
   0..CODING->max_charset_id and its entry in CODING->safe_charsets
   must not be 255, the value setup_iso_safe_charsets fills the table
   with before assigning usable entries.

   A negative ID yields false without indexing the table.  ID is
   evaluated more than once.  */

#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
/* Build the safe-charsets string for the coding system attributes
   ATTRS and store it in the coding_attr_safe_charsets slot.

   If the ISO flags include CODING_ISO_FLAG_FULL_SUPPORT and the
   charset list is not Viso_2022_charset_list, the list is first
   replaced by Viso_2022_charset_list and the slot is reset to nil.
   If the slot already holds a string, nothing more is done.

   The string has one byte per charset ID up to the largest ID in the
   charset list, all initially 255.  For each listed charset the byte
   is set to the value associated with that ID in the
   coding_attr_iso_request alist if there is one; otherwise to the car
   (for charsets without iso_chars_96) or the cdr (for charsets with
   it) of coding_attr_iso_usage, provided that value is less than 4.
   NOTE(review): these values look like graphic register numbers --
   confirm against the definition of the attributes.  */
2937 static void
2938 setup_iso_safe_charsets (Lisp_Object attrs)
2940 Lisp_Object charset_list, safe_charsets;
2941 Lisp_Object request;
2942 Lisp_Object reg_usage;
2943 Lisp_Object tail;
2944 EMACS_INT reg94, reg96;
2945 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2946 int max_charset_id;
2948 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2949 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2950 && ! EQ (charset_list, Viso_2022_charset_list))
2952 charset_list = Viso_2022_charset_list;
2953 ASET (attrs, coding_attr_charset_list, charset_list);
2954 ASET (attrs, coding_attr_safe_charsets, Qnil);
/* A string in the slot means the table was already built. */
2957 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2958 return;
/* Size the table by the largest charset ID in the list. */
2960 max_charset_id = 0;
2961 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2963 int id = XINT (XCAR (tail));
2964 if (max_charset_id < id)
2965 max_charset_id = id;
2968 safe_charsets = make_uninit_string (max_charset_id + 1);
2969 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2970 request = AREF (attrs, coding_attr_iso_request);
2971 reg_usage = AREF (attrs, coding_attr_iso_usage);
2972 reg94 = XINT (XCAR (reg_usage));
2973 reg96 = XINT (XCDR (reg_usage));
2975 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2977 Lisp_Object id;
2978 Lisp_Object reg;
2979 struct charset *charset;
2981 id = XCAR (tail);
2982 charset = CHARSET_FROM_ID (XINT (id));
2983 reg = Fcdr (Fassq (id, request));
2984 if (! NILP (reg))
2985 SSET (safe_charsets, XINT (id), XINT (reg));
2986 else if (charset->iso_chars_96)
2988 if (reg96 < 4)
2989 SSET (safe_charsets, XINT (id), reg96);
2991 else
2993 if (reg94 < 4)
2994 SSET (safe_charsets, XINT (id), reg94);
2997 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3001 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3002 Return true if a text is encoded in one of ISO-2022 based coding
3003 systems. */
/* Scan the bytes of CODING->source, after the first
   CODING->head_ascii bytes, and accumulate in the local masks FOUND
   and REJECTED which ISO-2022 coding categories the text is, or is
   not, consistent with.

   Before scanning, the safe-charsets table of every configured
   category from coding_category_iso_7 through
   coding_category_iso_8_else is loaded.

   The scan stops early, marks CATEGORY_MASK_ISO as rejected in
   DETECT_INFO and returns 0, once every ISO category has been
   rejected.  If instead the source is exhausted (label
   `no_more_source'), REJECTED and the non-rejected part of FOUND are
   merged into DETECT_INFO and 1 is returned.  */
3005 static bool
3006 detect_coding_iso_2022 (struct coding_system *coding,
3007 struct coding_detection_info *detect_info)
3009 const unsigned char *src = coding->source, *src_base = src;
3010 const unsigned char *src_end = coding->source + coding->src_bytes;
3011 bool multibytep = coding->src_multibyte;
3012 bool single_shifting = 0;
3013 int id;
3014 int c, c1;
3015 ptrdiff_t consumed_chars = 0;
3016 int i;
3017 int rejected = 0;
3018 int found = 0;
3019 int composition_count = -1;
3021 detect_info->checked |= CATEGORY_MASK_ISO;
3023 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3025 struct coding_system *this = &(coding_categories[i]);
3026 Lisp_Object attrs, val;
3028 if (this->id < 0)
3029 continue;
3030 attrs = CODING_ID_ATTRS (this->id);
3031 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3032 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3033 setup_iso_safe_charsets (attrs);
3034 val = CODING_ATTR_SAFE_CHARSETS (attrs);
3035 this->max_charset_id = SCHARS (val) - 1;
3036 this->safe_charsets = SDATA (val);
3039 /* A coding system of this category is always ASCII compatible. */
3040 src += coding->head_ascii;
3042 while (rejected != CATEGORY_MASK_ISO)
3044 src_base = src;
3045 ONE_MORE_BYTE (c);
3046 switch (c)
3048 case ISO_CODE_ESC:
3049 if (inhibit_iso_escape_detection)
3050 break;
3051 single_shifting = 0;
3052 ONE_MORE_BYTE (c);
3053 if (c == 'N' || c == 'O')
3055 /* ESC <Fe> for SS2 or SS3. */
3056 single_shifting = 1;
3057 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3059 else if (c == '1')
3061 /* End of composition. */
3062 if (composition_count < 0
3063 || composition_count > MAX_COMPOSITION_COMPONENTS)
3064 /* Invalid */
3065 break;
3066 composition_count = -1;
3067 found |= CATEGORY_MASK_ISO;
3069 else if (c >= '0' && c <= '4')
3071 /* ESC <Fp> for start/end composition. */
3072 composition_count = 0;
3074 else
3076 if (c >= '(' && c <= '/')
3078 /* Designation sequence for a charset of dimension 1. */
3079 ONE_MORE_BYTE (c1);
3080 if (c1 < ' ' || c1 >= 0x80
3081 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3083 /* Invalid designation sequence. Just ignore. */
3084 if (c1 >= 0x80)
3085 rejected |= (CATEGORY_MASK_ISO_7BIT
3086 | CATEGORY_MASK_ISO_7_ELSE);
3087 break;
3090 else if (c == '$')
3092 /* Designation sequence for a charset of dimension 2. */
3093 ONE_MORE_BYTE (c);
3094 if (c >= '@' && c <= 'B')
3095 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3096 id = iso_charset_table[1][0][c];
3097 else if (c >= '(' && c <= '/')
3099 ONE_MORE_BYTE (c1);
3100 if (c1 < ' ' || c1 >= 0x80
3101 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3103 /* Invalid designation sequence. Just ignore. */
3104 if (c1 >= 0x80)
3105 rejected |= (CATEGORY_MASK_ISO_7BIT
3106 | CATEGORY_MASK_ISO_7_ELSE);
3107 break;
3110 else
3112 /* Invalid designation sequence. Just ignore it. */
3113 if (c >= 0x80)
3114 rejected |= (CATEGORY_MASK_ISO_7BIT
3115 | CATEGORY_MASK_ISO_7_ELSE);
3116 break;
3119 else
3121 /* Invalid escape sequence. Just ignore it. */
3122 if (c >= 0x80)
3123 rejected |= (CATEGORY_MASK_ISO_7BIT
3124 | CATEGORY_MASK_ISO_7_ELSE);
3125 break;
3128 /* We found a valid designation sequence for CHARSET. */
3129 rejected |= CATEGORY_MASK_ISO_8BIT;
3130 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3131 id))
3132 found |= CATEGORY_MASK_ISO_7;
3133 else
3134 rejected |= CATEGORY_MASK_ISO_7;
3135 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3136 id))
3137 found |= CATEGORY_MASK_ISO_7_TIGHT;
3138 else
3139 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3140 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3141 id))
3142 found |= CATEGORY_MASK_ISO_7_ELSE;
3143 else
3144 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3145 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3146 id))
3147 found |= CATEGORY_MASK_ISO_8_ELSE;
3148 else
3149 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3151 break;
3153 case ISO_CODE_SO:
3154 case ISO_CODE_SI:
3155 /* Locking shift out/in. */
3156 if (inhibit_iso_escape_detection)
3157 break;
3158 single_shifting = 0;
3159 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3160 break;
3162 case ISO_CODE_CSI:
3163 /* Control sequence introducer. */
3164 single_shifting = 0;
3165 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3166 found |= CATEGORY_MASK_ISO_8_ELSE;
3167 goto check_extra_latin;
3169 case ISO_CODE_SS2:
3170 case ISO_CODE_SS3:
3171 /* Single shift. */
3172 if (inhibit_iso_escape_detection)
3173 break;
3174 single_shifting = 0;
3175 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3176 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3177 & CODING_ISO_FLAG_SINGLE_SHIFT)
3179 found |= CATEGORY_MASK_ISO_8_1;
3180 single_shifting = 1;
3182 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3183 & CODING_ISO_FLAG_SINGLE_SHIFT)
3185 found |= CATEGORY_MASK_ISO_8_2;
3186 single_shifting = 1;
3188 if (single_shifting)
3189 break;
3190 goto check_extra_latin;
3192 default:
3193 if (c < 0)
3194 continue;
3195 if (c < 0x80)
3197 if (composition_count >= 0)
3198 composition_count++;
3199 single_shifting = 0;
3200 break;
3202 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3203 if (c >= 0xA0)
3205 found |= CATEGORY_MASK_ISO_8_1;
3206 /* Check the length of succeeding codes of the range
3207 0xA0..0xFF. If the byte length is even, we include
3208 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3209 only when we are not single shifting. */
3210 if (! single_shifting
3211 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3213 ptrdiff_t len = 1;
3214 while (src < src_end)
3216 src_base = src;
3217 ONE_MORE_BYTE (c);
3218 if (c < 0xA0)
3220 src = src_base;
3221 break;
3223 len++;
3226 if (len & 1 && src < src_end)
3228 rejected |= CATEGORY_MASK_ISO_8_2;
3229 if (composition_count >= 0)
3230 composition_count += len;
3232 else
3234 found |= CATEGORY_MASK_ISO_8_2;
3235 if (composition_count >= 0)
3236 composition_count += len / 2;
3239 break;
/* C is accepted only if Vlatin_extra_code_table is a vector with a
   non-nil entry for it; otherwise every ISO category is rejected.  */
3241 check_extra_latin:
3242 if (! VECTORP (Vlatin_extra_code_table)
3243 || NILP (AREF (Vlatin_extra_code_table, c)))
3245 rejected = CATEGORY_MASK_ISO;
3246 break;
3248 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3249 & CODING_ISO_FLAG_LATIN_EXTRA)
3250 found |= CATEGORY_MASK_ISO_8_1;
3251 else
3252 rejected |= CATEGORY_MASK_ISO_8_1;
3253 rejected |= CATEGORY_MASK_ISO_8_2;
3254 break;
3257 detect_info->rejected |= CATEGORY_MASK_ISO;
3258 return 0;
3260 no_more_source:
3261 detect_info->rejected |= rejected;
3262 detect_info->found |= (found & ~rejected);
3263 return 1;
/* Set designation state into CODING: record that the charset
   identified by dimension DIM, CHARS_96 and final byte FINAL is
   designated to graphic register REG.

   If FINAL is outside '0'..127, or no charset is found, or the
   charset is not safe for CODING, the designation of REG is set to -2
   and CHARS_96 is set to -1.  Otherwise the charset ID is stored,
   with charset_jisx0201_roman replaced by charset_ascii when
   CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978
   replaced by charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is
   set.

   CHARS_96 is set to -1 if the escape sequence should be kept.  */

#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id, prev;                                                       \
                                                                        \
    if (final < '0' || final >= 128                                     \
        || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
        || !SAFE_CHARSET_P (coding, id))                                \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
        break;                                                          \
      }                                                                 \
    prev = CODING_ISO_DESIGNATION (coding, reg);                        \
    if (id == charset_jisx0201_roman)                                   \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
          id = charset_ascii;                                           \
      }                                                                 \
    else if (id == charset_jisx0208_1978)                               \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
          id = charset_jisx0208;                                        \
      }                                                                 \
    CODING_ISO_DESIGNATION (coding, reg) = id;                          \
    /* If there was an invalid designation to REG previously, and this  \
       designation is ASCII to REG, we should keep this designation     \
       sequence.  */                                                    \
    if (prev == -2 && id == charset_ascii)                              \
      chars_96 = -1;                                                    \
  } while (0)
3301 /* Handle these composition sequences (ALT: alternate char):
3303 (1) relative composition: ESC 0 CHAR ... ESC 1
3304 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3305 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3306 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3308 When the start sequence (ESC 0/2/3/4) is found, this annotation
3309 header is produced.
3311 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3313 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3314 produced until the end sequence (ESC 1) is found:
3316 (1) CHAR ... CHAR
3317 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3318 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3319 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3321 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3322 annotation header are updated as below:
3324 (1) LENGTH: unchanged, NCHARS: number of CHARs
3325 (2) LENGTH: unchanged, NCHARS: number of CHARs
3326 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3327 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3329 If an error is found while composing, the annotation header is
3330 changed to:
3332 [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3334 and the sequence [ -2 DECODED-RULE ] is changed to the original
3335 byte sequence as below:
3336 o the original byte sequence is B: [ B -1 ]
3337 o the original byte sequence is B1 B2: [ B1 B2 ]
3338 and the sequence [ -1 -1 ] is changed to the original byte
3339 sequence:
3340 [ ESC '0' ]
/* Decode a composition rule whose first byte is C1 (taken from the
   expansion site), reading one more source byte when the rule uses
   the two-byte format, and store the encoded composition rule in
   RULE.  Jump to invalid_code if the rule is invalid.

   A first byte below 32 + 81 is the old one-byte format (before
   ver.21); anything else is the new two-byte format (after ver.21),
   whose result has 0x100 added to tell it apart from the old one.  */

#define DECODE_COMPOSITION_RULE(rule)                                     \
  do {                                                                    \
    rule = c1 - 32;                                                       \
    if (rule < 0)                                                         \
      goto invalid_code;                                                  \
    if (rule >= 81)                                                       \
      {                                                                   \
        /* New format: the second byte carries the other reference.  */   \
        int second;                                                       \
                                                                          \
        ONE_MORE_BYTE (second);                                           \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, second - 32))     \
          goto invalid_code;                                              \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, second - 32);          \
        rule += 0x100;   /* Distinguish it from the old format.  */       \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        /* Old format: both references packed base 9 in one byte,         \
           with the value 4 standing for reference point 10.  */          \
        int gref = (rule) / 9;                                            \
        int nref = (rule) % 9;                                            \
                                                                          \
        if (gref == 4)                                                    \
          gref = 10;                                                      \
        if (nref == 4)                                                    \
          nref = 10;                                                      \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                      \
      }                                                                   \
  } while (0)
/* Turn the decoded composition rule RULE back into its source
   byte(s), stored at charbuf[idx] and charbuf[idx + 1]:

     old format (RULE < 0x100):  [ B -1 ], counts as one character
     new format:                 [ B1 B2 ], counts as two characters

   Uses `charbuf', `idx' and `new_chars' from the expansion site.  */
#define ENCODE_COMPOSITION_RULE(rule)                                     \
  do {                                                                    \
    int decoded_rule = rule;                                              \
    int gref = (decoded_rule % 0x100) / 12;                               \
    int nref = (decoded_rule % 0x100) % 12;                               \
                                                                          \
    if (decoded_rule >= 0x100)          /* new format */                  \
      {                                                                   \
        charbuf[idx] = 32 + 81 + gref;                                    \
        charbuf[idx + 1] = 32 + nref;                                     \
        new_chars += 2;                                                   \
      }                                                                   \
    else                                /* old format */                  \
      {                                                                   \
        if (gref == 10)                                                   \
          gref = 4;                                                       \
        if (nref == 10)                                                   \
          nref = 4;                                                       \
        charbuf[idx] = 32 + gref * 9 + nref;                              \
        charbuf[idx + 1] = -1;                                            \
        new_chars++;                                                      \
      }                                                                   \
  } while (0)
3392 /* Finish the current composition as invalid. */
3394 static int
3395 finish_composition (int *charbuf, struct composition_status *cmp_status)
3397 int idx = - cmp_status->length;
3398 int new_chars;
3400 /* Recover the original ESC sequence */
3401 charbuf[idx++] = ISO_CODE_ESC;
3402 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3403 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3404 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3405 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3406 : '4');
3407 charbuf[idx++] = -2;
3408 charbuf[idx++] = 0;
3409 charbuf[idx++] = -1;
3410 new_chars = cmp_status->nchars;
3411 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3412 for (; idx < 0; idx++)
3414 int elt = charbuf[idx];
3416 if (elt == -2)
3418 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3419 idx++;
3421 else if (elt == -1)
3423 charbuf[idx++] = ISO_CODE_ESC;
3424 charbuf[idx] = '0';
3425 new_chars += 2;
3428 cmp_status->state = COMPOSING_NO;
3429 return new_chars;
/* If a composition is in progress, finish it as invalid and add the
   number of characters finish_composition reports to `char_offset'.
   Uses `cmp_status', `charbuf' and `char_offset' from the expansion
   site.  */
#define MAYBE_FINISH_COMPOSITION()                                        \
  do {                                                                    \
    if (cmp_status->state != COMPOSING_NO)                                \
      {                                                                   \
        char_offset += finish_composition (charbuf, cmp_status);          \
      }                                                                   \
  } while (0)
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4;
   C1 is the byte following ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition  : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   When ESC 0 ends the component part of an altchar (ESC 3) or
   alt&rule (ESC 4) composition, store the separator [ -1 -1 ] and
   switch to reading the composed characters.

   Otherwise finish any composition in progress as invalid and start
   a new one by producing this annotation header through
   ADD_COMPOSITION_DATA:

     [ -LENGTH CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   where LENGTH is recorded as MAX_ANNOTATION_LENGTH.

   Uses `cmp_status', `charbuf', `char_offset' and `coding' from the
   expansion site.  */

#define DECODE_COMPOSITION_START(c1)                                      \
  do {                                                                    \
    bool components_end                                                   \
      = (c1 == '0'                                                        \
         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR               \
              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)         \
             || (cmp_status->state == COMPOSING_COMPONENT_RULE            \
                 && (cmp_status->method                                   \
                     == COMPOSITION_WITH_RULE_ALTCHARS))));               \
                                                                          \
    if (! components_end)                                                 \
      {                                                                   \
        MAYBE_FINISH_COMPOSITION ();                                      \
        cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE            \
                              : c1 == '2' ? COMPOSITION_WITH_RULE         \
                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS     \
                              : COMPOSITION_WITH_RULE_ALTCHARS);          \
        /* ESC 3 and ESC 4 begin with the alternate components.  */       \
        cmp_status->state                                                 \
          = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);      \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);         \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                       \
        cmp_status->nchars = cmp_status->ncomps = 0;                      \
        coding->annotated = 1;                                            \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        *charbuf++ = -1;                                                  \
        *charbuf++ = -1;                                                  \
        cmp_status->state = COMPOSING_CHAR;                               \
        cmp_status->length += 2;                                          \
      }                                                                   \
  } while (0)
/* Handle composition end sequence ESC 1.

   The composition is invalid when no composed character was read, or
   when the current state does not fit the method: a rulebase
   composition must not end in state COMPOSING_CHAR, any other method
   must.  In that case finish the composition as invalid and jump to
   invalid_code.

   Otherwise update the annotation header, which starts
   cmp_status->length elements before `charbuf': for the altchar
   methods the (negative) LENGTH element is decreased by the size of
   the component part, and the NCHARS element is set to the number of
   composed characters.  `char_offset' is advanced by that number.

   Uses `cmp_status', `charbuf' and `char_offset' from the expansion
   site.  */

#define DECODE_COMPOSITION_END()                                          \
  do {                                                                    \
    int *annotation;                                                      \
                                                                          \
    if (cmp_status->nchars == 0                                           \
        || ((cmp_status->state == COMPOSING_CHAR)                         \
            == (cmp_status->method == COMPOSITION_WITH_RULE)))            \
      {                                                                   \
        MAYBE_FINISH_COMPOSITION ();                                      \
        goto invalid_code;                                                \
      }                                                                   \
    annotation = charbuf - cmp_status->length;                            \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                  \
      annotation[0] -= cmp_status->ncomps + 2;                            \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)        \
      annotation[0] -= cmp_status->ncomps * 3;                            \
    annotation[2] = cmp_status->nchars;                                   \
    char_offset += cmp_status->nchars;                                    \
    cmp_status->state = COMPOSING_NO;                                     \
  } while (0)
/* Store the pair [ -2 RULE ] in charbuf for the decoded composition
   rule RULE, add 2 to cmp_status->length, and step
   cmp_status->state back by one.  Uses `charbuf' and `cmp_status'
   from the expansion site.  */

#define STORE_COMPOSITION_RULE(rule)                                      \
  do {                                                                    \
    charbuf[0] = -2;                                                      \
    charbuf[1] = rule;                                                    \
    charbuf += 2;                                                         \
    cmp_status->length += 2;                                              \
    cmp_status->state--;                                                  \
  } while (0)
/* Store a composed char or a component char C in charbuf, and update
   cmp_status: the length grows by one, and either nchars (in state
   COMPOSING_CHAR) or ncomps (any other state) is incremented.

   When a rule is expected next, i.e. for a rulebase composition or
   for the component part of an alt&rule composition, the state is
   advanced by one.

   Uses `charbuf' and `cmp_status' from the expansion site.  */

#define STORE_COMPOSITION_CHAR(c)                                         \
  do {                                                                    \
    *charbuf++ = (c);                                                     \
    cmp_status->length++;                                                 \
    if (cmp_status->state != COMPOSING_CHAR)                              \
      cmp_status->ncomps++;                                               \
    else                                                                  \
      cmp_status->nchars++;                                               \
    if (cmp_status->method == COMPOSITION_WITH_RULE                       \
        || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS          \
            && cmp_status->state == COMPOSING_COMPONENT_CHAR))            \
      {                                                                   \
        cmp_status->state++;                                              \
      }                                                                   \
  } while (0)
3529 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3531 static void
3532 decode_coding_iso_2022 (struct coding_system *coding)
3534 const unsigned char *src = coding->source + coding->consumed;
3535 const unsigned char *src_end = coding->source + coding->src_bytes;
3536 const unsigned char *src_base;
3537 int *charbuf = coding->charbuf + coding->charbuf_used;
3538 /* We may produce two annotations (charset and composition) in one
3539 loop and one more charset annotation at the end. */
3540 int *charbuf_end
3541 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3542 ptrdiff_t consumed_chars = 0, consumed_chars_base;
3543 bool multibytep = coding->src_multibyte;
3544 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3545 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3546 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3547 int charset_id_2, charset_id_3;
3548 struct charset *charset;
3549 int c;
3550 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3551 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3552 ptrdiff_t char_offset = coding->produced_char;
3553 ptrdiff_t last_offset = char_offset;
3554 int last_id = charset_ascii;
3555 bool eol_dos
3556 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3557 int byte_after_cr = -1;
3558 int i;
3560 setup_iso_safe_charsets (attrs);
3561 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3563 if (cmp_status->state != COMPOSING_NO)
3565 if (charbuf_end - charbuf < cmp_status->length)
3566 emacs_abort ();
3567 for (i = 0; i < cmp_status->length; i++)
3568 *charbuf++ = cmp_status->carryover[i];
3569 coding->annotated = 1;
3572 while (1)
3574 int c1, c2, c3;
3576 src_base = src;
3577 consumed_chars_base = consumed_chars;
3579 if (charbuf >= charbuf_end)
3581 if (byte_after_cr >= 0)
3582 src_base--;
3583 break;
3586 if (byte_after_cr >= 0)
3587 c1 = byte_after_cr, byte_after_cr = -1;
3588 else
3589 ONE_MORE_BYTE (c1);
3590 if (c1 < 0)
3591 goto invalid_code;
3593 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3595 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3596 char_offset++;
3597 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3598 continue;
3601 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3603 if (c1 == ISO_CODE_ESC)
3605 if (src + 1 >= src_end)
3606 goto no_more_source;
3607 *charbuf++ = ISO_CODE_ESC;
3608 char_offset++;
3609 if (src[0] == '%' && src[1] == '@')
3611 src += 2;
3612 consumed_chars += 2;
3613 char_offset += 2;
3614 /* We are sure charbuf can contain two more chars. */
3615 *charbuf++ = '%';
3616 *charbuf++ = '@';
3617 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3620 else
3622 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3623 char_offset++;
3625 continue;
3628 if ((cmp_status->state == COMPOSING_RULE
3629 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3630 && c1 != ISO_CODE_ESC)
3632 int rule;
3634 DECODE_COMPOSITION_RULE (rule);
3635 STORE_COMPOSITION_RULE (rule);
3636 continue;
3639 /* We produce at most one character. */
3640 switch (iso_code_class [c1])
3642 case ISO_0x20_or_0x7F:
3643 if (charset_id_0 < 0
3644 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3645 /* This is SPACE or DEL. */
3646 charset = CHARSET_FROM_ID (charset_ascii);
3647 else
3648 charset = CHARSET_FROM_ID (charset_id_0);
3649 break;
3651 case ISO_graphic_plane_0:
3652 if (charset_id_0 < 0)
3653 charset = CHARSET_FROM_ID (charset_ascii);
3654 else
3655 charset = CHARSET_FROM_ID (charset_id_0);
3656 break;
3658 case ISO_0xA0_or_0xFF:
3659 if (charset_id_1 < 0
3660 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3661 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3662 goto invalid_code;
3663 /* This is a graphic character, we fall down ... */
3665 case ISO_graphic_plane_1:
3666 if (charset_id_1 < 0)
3667 goto invalid_code;
3668 charset = CHARSET_FROM_ID (charset_id_1);
3669 break;
3671 case ISO_control_0:
3672 if (eol_dos && c1 == '\r')
3673 ONE_MORE_BYTE (byte_after_cr);
3674 MAYBE_FINISH_COMPOSITION ();
3675 charset = CHARSET_FROM_ID (charset_ascii);
3676 break;
3678 case ISO_control_1:
3679 goto invalid_code;
3681 case ISO_shift_out:
3682 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3683 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3684 goto invalid_code;
3685 CODING_ISO_INVOCATION (coding, 0) = 1;
3686 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3687 continue;
3689 case ISO_shift_in:
3690 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3691 goto invalid_code;
3692 CODING_ISO_INVOCATION (coding, 0) = 0;
3693 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3694 continue;
3696 case ISO_single_shift_2_7:
3697 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3698 goto invalid_code;
3699 case ISO_single_shift_2:
3700 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3701 goto invalid_code;
3702 /* SS2 is handled as an escape sequence of ESC 'N' */
3703 c1 = 'N';
3704 goto label_escape_sequence;
3706 case ISO_single_shift_3:
3707 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3708 goto invalid_code;
3709 /* SS2 is handled as an escape sequence of ESC 'O' */
3710 c1 = 'O';
3711 goto label_escape_sequence;
3713 case ISO_control_sequence_introducer:
3714 /* CSI is handled as an escape sequence of ESC '[' ... */
3715 c1 = '[';
3716 goto label_escape_sequence;
3718 case ISO_escape:
3719 ONE_MORE_BYTE (c1);
3720 label_escape_sequence:
3721 /* Escape sequences handled here are invocation,
3722 designation, direction specification, and character
3723 composition specification. */
3724 switch (c1)
3726 case '&': /* revision of following character set */
3727 ONE_MORE_BYTE (c1);
3728 if (!(c1 >= '@' && c1 <= '~'))
3729 goto invalid_code;
3730 ONE_MORE_BYTE (c1);
3731 if (c1 != ISO_CODE_ESC)
3732 goto invalid_code;
3733 ONE_MORE_BYTE (c1);
3734 goto label_escape_sequence;
3736 case '$': /* designation of 2-byte character set */
3737 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3738 goto invalid_code;
3740 int reg, chars96;
3742 ONE_MORE_BYTE (c1);
3743 if (c1 >= '@' && c1 <= 'B')
3744 { /* designation of JISX0208.1978, GB2312.1980,
3745 or JISX0208.1980 */
3746 reg = 0, chars96 = 0;
3748 else if (c1 >= 0x28 && c1 <= 0x2B)
3749 { /* designation of DIMENSION2_CHARS94 character set */
3750 reg = c1 - 0x28, chars96 = 0;
3751 ONE_MORE_BYTE (c1);
3753 else if (c1 >= 0x2C && c1 <= 0x2F)
3754 { /* designation of DIMENSION2_CHARS96 character set */
3755 reg = c1 - 0x2C, chars96 = 1;
3756 ONE_MORE_BYTE (c1);
3758 else
3759 goto invalid_code;
3760 DECODE_DESIGNATION (reg, 2, chars96, c1);
3761 /* We must update these variables now. */
3762 if (reg == 0)
3763 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3764 else if (reg == 1)
3765 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3766 if (chars96 < 0)
3767 goto invalid_code;
3769 continue;
3771 case 'n': /* invocation of locking-shift-2 */
3772 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3773 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3774 goto invalid_code;
3775 CODING_ISO_INVOCATION (coding, 0) = 2;
3776 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3777 continue;
3779 case 'o': /* invocation of locking-shift-3 */
3780 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3781 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3782 goto invalid_code;
3783 CODING_ISO_INVOCATION (coding, 0) = 3;
3784 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3785 continue;
3787 case 'N': /* invocation of single-shift-2 */
3788 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3789 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3790 goto invalid_code;
3791 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3792 if (charset_id_2 < 0)
3793 charset = CHARSET_FROM_ID (charset_ascii);
3794 else
3795 charset = CHARSET_FROM_ID (charset_id_2);
3796 ONE_MORE_BYTE (c1);
3797 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3798 || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3799 && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3800 ? c1 >= 0x80 : c1 < 0x80)))
3801 goto invalid_code;
3802 break;
3804 case 'O': /* invocation of single-shift-3 */
3805 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3806 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3807 goto invalid_code;
3808 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3809 if (charset_id_3 < 0)
3810 charset = CHARSET_FROM_ID (charset_ascii);
3811 else
3812 charset = CHARSET_FROM_ID (charset_id_3);
3813 ONE_MORE_BYTE (c1);
3814 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3815 || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3816 && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3817 ? c1 >= 0x80 : c1 < 0x80)))
3818 goto invalid_code;
3819 break;
3821 case '0': case '2': case '3': case '4': /* start composition */
3822 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3823 goto invalid_code;
3824 if (last_id != charset_ascii)
3826 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3827 last_id = charset_ascii;
3828 last_offset = char_offset;
3830 DECODE_COMPOSITION_START (c1);
3831 continue;
3833 case '1': /* end composition */
3834 if (cmp_status->state == COMPOSING_NO)
3835 goto invalid_code;
3836 DECODE_COMPOSITION_END ();
3837 continue;
3839 case '[': /* specification of direction */
3840 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3841 goto invalid_code;
3842 /* For the moment, nested direction is not supported.
3843 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3844 left-to-right, and nonzero means right-to-left. */
3845 ONE_MORE_BYTE (c1);
3846 switch (c1)
3848 case ']': /* end of the current direction */
3849 coding->mode &= ~CODING_MODE_DIRECTION;
3851 case '0': /* end of the current direction */
3852 case '1': /* start of left-to-right direction */
3853 ONE_MORE_BYTE (c1);
3854 if (c1 == ']')
3855 coding->mode &= ~CODING_MODE_DIRECTION;
3856 else
3857 goto invalid_code;
3858 break;
3860 case '2': /* start of right-to-left direction */
3861 ONE_MORE_BYTE (c1);
3862 if (c1 == ']')
3863 coding->mode |= CODING_MODE_DIRECTION;
3864 else
3865 goto invalid_code;
3866 break;
3868 default:
3869 goto invalid_code;
3871 continue;
3873 case '%':
3874 ONE_MORE_BYTE (c1);
3875 if (c1 == '/')
3877 /* CTEXT extended segment:
3878 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3879 We keep these bytes as is for the moment.
3880 They may be decoded by post-read-conversion. */
3881 int dim, M, L;
3882 int size;
3884 ONE_MORE_BYTE (dim);
3885 if (dim < '0' || dim > '4')
3886 goto invalid_code;
3887 ONE_MORE_BYTE (M);
3888 if (M < 128)
3889 goto invalid_code;
3890 ONE_MORE_BYTE (L);
3891 if (L < 128)
3892 goto invalid_code;
3893 size = ((M - 128) * 128) + (L - 128);
3894 if (charbuf + 6 > charbuf_end)
3895 goto break_loop;
3896 *charbuf++ = ISO_CODE_ESC;
3897 *charbuf++ = '%';
3898 *charbuf++ = '/';
3899 *charbuf++ = dim;
3900 *charbuf++ = BYTE8_TO_CHAR (M);
3901 *charbuf++ = BYTE8_TO_CHAR (L);
3902 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3904 else if (c1 == 'G')
3906 /* XFree86 extension for embedding UTF-8 in CTEXT:
3907 ESC % G --UTF-8-BYTES-- ESC % @
3908 We keep these bytes as is for the moment.
3909 They may be decoded by post-read-conversion. */
3910 if (charbuf + 3 > charbuf_end)
3911 goto break_loop;
3912 *charbuf++ = ISO_CODE_ESC;
3913 *charbuf++ = '%';
3914 *charbuf++ = 'G';
3915 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3917 else
3918 goto invalid_code;
3919 continue;
3920 break;
3922 default:
3923 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3924 goto invalid_code;
3926 int reg, chars96;
3928 if (c1 >= 0x28 && c1 <= 0x2B)
3929 { /* designation of DIMENSION1_CHARS94 character set */
3930 reg = c1 - 0x28, chars96 = 0;
3931 ONE_MORE_BYTE (c1);
3933 else if (c1 >= 0x2C && c1 <= 0x2F)
3934 { /* designation of DIMENSION1_CHARS96 character set */
3935 reg = c1 - 0x2C, chars96 = 1;
3936 ONE_MORE_BYTE (c1);
3938 else
3939 goto invalid_code;
3940 DECODE_DESIGNATION (reg, 1, chars96, c1);
3941 /* We must update these variables now. */
3942 if (reg == 0)
3943 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3944 else if (reg == 1)
3945 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3946 if (chars96 < 0)
3947 goto invalid_code;
3949 continue;
3951 break;
3953 default:
3954 emacs_abort ();
3957 if (cmp_status->state == COMPOSING_NO
3958 && charset->id != charset_ascii
3959 && last_id != charset->id)
3961 if (last_id != charset_ascii)
3962 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3963 last_id = charset->id;
3964 last_offset = char_offset;
3967 /* Now we know CHARSET and 1st position code C1 of a character.
3968 Produce a decoded character while getting 2nd and 3rd
3969 position codes C2, C3 if necessary. */
3970 if (CHARSET_DIMENSION (charset) > 1)
3972 ONE_MORE_BYTE (c2);
3973 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3974 || ((c1 & 0x80) != (c2 & 0x80)))
3975 /* C2 is not in a valid range. */
3976 goto invalid_code;
3977 if (CHARSET_DIMENSION (charset) == 2)
3978 c1 = (c1 << 8) | c2;
3979 else
3981 ONE_MORE_BYTE (c3);
3982 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3983 || ((c1 & 0x80) != (c3 & 0x80)))
3984 /* C3 is not in a valid range. */
3985 goto invalid_code;
3986 c1 = (c1 << 16) | (c2 << 8) | c2;
3989 c1 &= 0x7F7F7F;
3990 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3991 if (c < 0)
3993 MAYBE_FINISH_COMPOSITION ();
3994 for (; src_base < src; src_base++, char_offset++)
3996 if (ASCII_BYTE_P (*src_base))
3997 *charbuf++ = *src_base;
3998 else
3999 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4002 else if (cmp_status->state == COMPOSING_NO)
4004 *charbuf++ = c;
4005 char_offset++;
4007 else if ((cmp_status->state == COMPOSING_CHAR
4008 ? cmp_status->nchars
4009 : cmp_status->ncomps)
4010 >= MAX_COMPOSITION_COMPONENTS)
4012 /* Too long composition. */
4013 MAYBE_FINISH_COMPOSITION ();
4014 *charbuf++ = c;
4015 char_offset++;
4017 else
4018 STORE_COMPOSITION_CHAR (c);
4019 continue;
4021 invalid_code:
4022 MAYBE_FINISH_COMPOSITION ();
4023 src = src_base;
4024 consumed_chars = consumed_chars_base;
4025 ONE_MORE_BYTE (c);
4026 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4027 char_offset++;
4028 coding->errors++;
4029 /* Reset the invocation and designation status to the safest
4030 one; i.e. designate ASCII to the graphic register 0, and
4031 invoke that register to the graphic plane 0. This typically
4032 helps the case that an designation sequence for ASCII "ESC (
4033 B" is somehow broken (e.g. broken by a newline). */
4034 CODING_ISO_INVOCATION (coding, 0) = 0;
4035 CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
4036 charset_id_0 = charset_ascii;
4037 continue;
4039 break_loop:
4040 break;
4043 no_more_source:
4044 if (cmp_status->state != COMPOSING_NO)
4046 if (coding->mode & CODING_MODE_LAST_BLOCK)
4047 MAYBE_FINISH_COMPOSITION ();
4048 else
4050 charbuf -= cmp_status->length;
4051 for (i = 0; i < cmp_status->length; i++)
4052 cmp_status->carryover[i] = charbuf[i];
4055 else if (last_id != charset_ascii)
4056 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4057 coding->consumed_char += consumed_chars_base;
4058 coding->consumed = src_base - coding->source;
4059 coding->charbuf_used = charbuf - coding->charbuf;
4063 /* ISO2022 encoding stuff. */
4066 It is not enough to say just "ISO2022" on encoding, we have to
4067 specify more details. In Emacs, each coding system of ISO2022
4068 variant has the following specifications:
4069 1. Initial designation to G0 thru G3.
4070 2. Allows short-form designation?
4071 3. ASCII should be designated to G0 before control characters?
4072 4. ASCII should be designated to G0 at end of line?
4073 5. 7-bit environment or 8-bit environment?
4074 6. Use locking-shift?
4075 7. Use Single-shift?
4076 And the following two are only for Japanese:
4077 8. Use ASCII in place of JISX0201-1976-Roman?
4078 9. Use JISX0208-1983 in place of JISX0208-1978?
4079 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4080 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
4081 details.
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST.  The sequence is

     [ ESC '&' '@'+REVISION ] ESC [ '$' ] [ INTERMEDIATE ] FINAL

   The revision part is produced only when CODING has
   CODING_ISO_FLAG_REVISION and CHARSET has a non-negative revision.
   '$' is produced for charsets whose dimension is not 1.
   INTERMEDIATE is taken from "()*+" for 94-char sets and from ",-./"
   for 96-char sets, indexed by REG.  It is omitted (short form) only
   for a multi-dimension 94-char set designated to register 0 whose
   <final-char> is '@', 'A', or 'B', when CODING does not have
   CODING_ISO_FLAG_LONG_FORM.

   Finally record CHARSET as the designation of REG in CODING.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                          \
  do {                                                                    \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);               \
    /* Intermediate bytes for this kind of charset, indexed by REG.  */   \
    const char *intermediates                                             \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";                 \
    int revision = -1;                                                    \
                                                                          \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)             \
      revision = CHARSET_ISO_REVISION (charset);                          \
    if (revision >= 0)                                                    \
      {                                                                   \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                         \
        EMIT_ONE_BYTE ('@' + revision);                                   \
      }                                                                   \
                                                                          \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                   \
    if (CHARSET_DIMENSION (charset) != 1)                                 \
      {                                                                   \
        EMIT_ONE_ASCII_BYTE ('$');                                        \
        if (CHARSET_ISO_CHARS_96 (charset)                                \
            || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM      \
            || reg != 0                                                   \
            || final_char < '@' || final_char > 'B')                      \
          {                                                               \
            EMIT_ONE_ASCII_BYTE (intermediates[reg]);                     \
          }                                                               \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        int b = intermediates[reg];                                       \
        EMIT_ONE_ASCII_BYTE (b);                                          \
      }                                                                   \
    EMIT_ONE_ASCII_BYTE (final_char);                                     \
                                                                          \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);          \
  } while (0)
/* The following two macros produce codes for ISO2022 single-shift
   functions (single-shift-2 and single-shift-3): the escape sequence
   ESC 'N' or ESC 'O' when `coding' has CODING_ISO_FLAG_SEVEN_BITS,
   the control character ISO_CODE_SS2 or ISO_CODE_SS3 otherwise.
   Both also set the single-shifting state of `coding' to 1.  */

#define ENCODE_SINGLE_SHIFT_2                                             \
  do {                                                                    \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))       \
      {                                                                   \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
      }                                                                   \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                              \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                             \
  do {                                                                    \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))       \
      {                                                                   \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
      }                                                                   \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                              \
  } while (0)
/* The locking-shift macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions and record in
   `coding' which graphic register is now invoked to graphic plane 0:

     ENCODE_SHIFT_IN          ISO_CODE_SI   register 0
     ENCODE_SHIFT_OUT         ISO_CODE_SO   register 1
     ENCODE_LOCKING_SHIFT_2   ESC 'n'       register 2  */

#define ENCODE_SHIFT_IN                                                   \
  do {                                                                    \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                                    \
    CODING_ISO_INVOCATION (coding, 0) = 0;                                \
  } while (0)


#define ENCODE_SHIFT_OUT                                                  \
  do {                                                                    \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                                    \
    CODING_ISO_INVOCATION (coding, 0) = 1;                                \
  } while (0)


#define ENCODE_LOCKING_SHIFT_2                                            \
  do {                                                                    \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');                             \
    CODING_ISO_INVOCATION (coding, 0) = 2;                                \
  } while (0)
/* Produce the locking-shift-3 escape sequence ESC 'o' and record in
   `coding' that graphic register 3 is invoked to graphic plane 0.
   The final byte must be 'o': ESC 'n' is locking-shift-2, and the
   decoder treats ESC 'o' as locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3                                            \
  do {                                                                    \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');                             \
    CODING_ISO_INVOCATION (coding, 0) = 3;                                \
  } while (0)
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   When `coding' has CODING_ISO_FLAG_USE_ROMAN, ASCII is replaced by
   JISX0201-Roman; CHARSET is assigned in that case.  The loop ends
   as soon as the byte is emitted: right after a single shift, or
   when CHARSET is invoked to graphic plane 0 (7-bit form) or graphic
   plane 1 (8-bit form).

   Uses `coding', `dst' and `produced_chars' from the expansion
   site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                      \
  do {                                                                    \
    int id = CHARSET_ID (charset);                                        \
                                                                          \
    if (id == charset_ascii                                               \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN))       \
      {                                                                   \
        id = charset_jisx0201_roman;                                      \
        charset = CHARSET_FROM_ID (id);                                   \
      }                                                                   \
                                                                          \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                              \
      {                                                                   \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))   \
          EMIT_ONE_BYTE (c1 | 0x80);                                      \
        else                                                              \
          EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                          \
        break;                                                            \
      }                                                                   \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                     \
      {                                                                   \
        EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                  \
        break;                                                            \
      }                                                                   \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                     \
      {                                                                   \
        EMIT_ONE_BYTE (c1 | 0x80);                                        \
        break;                                                            \
      }                                                                   \
    /* Since CHARSET is not yet invoked to any graphic planes, we         \
       must invoke it, or, at first, designate it to some graphic         \
       register.  Then repeat the loop to actually produce the            \
       character.  */                                                     \
    dst = encode_invocation_designation (charset, coding, dst,            \
                                         &produced_chars);                \
  } while (1)
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   When `coding' has CODING_ISO_FLAG_USE_OLDJIS, JISX0208 is replaced
   by JISX0208.1978; CHARSET is assigned in that case.  The loop ends
   as soon as the two bytes are emitted: right after a single shift,
   or when CHARSET is invoked to graphic plane 0 (7-bit form) or
   graphic plane 1 (8-bit form).

   Uses `coding', `dst' and `produced_chars' from the expansion
   site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                  \
  do {                                                                    \
    int id = CHARSET_ID (charset);                                        \
                                                                          \
    if (id == charset_jisx0208                                            \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))      \
      {                                                                   \
        id = charset_jisx0208_1978;                                       \
        charset = CHARSET_FROM_ID (id);                                   \
      }                                                                   \
                                                                          \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                              \
      {                                                                   \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))   \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        else                                                              \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                          \
        break;                                                            \
      }                                                                   \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                     \
      {                                                                   \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
        break;                                                            \
      }                                                                   \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                     \
      {                                                                   \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
        break;                                                            \
      }                                                                   \
    /* Since CHARSET is not yet invoked to any graphic planes, we         \
       must invoke it, or, at first, designate it to some graphic         \
       register.  Then repeat the loop to actually produce the            \
       character.  */                                                     \
    dst = encode_invocation_designation (charset, coding, dst,            \
                                         &produced_chars);                \
  } while (1)
/* Produce codes for character C of CHARSET.  C is first converted to
   its code in CHARSET by CODING_ENCODE_CHAR.  A charset of dimension
   1 emits that code as one position-code; any other charset emits
   the two position-codes `code >> 8' and `code & 0xFF'.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                  \
  do {                                                                    \
    unsigned code;                                                        \
                                                                          \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);      \
    if (CHARSET_DIMENSION (charset) != 1)                                 \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                  \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                  \
  } while (0)
4289 /* Produce designation and invocation codes at a place pointed by DST
4290 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4291 Return new DST. */
/* Make CHARSET usable in the output stream.  If CHARSET is not the
   current designation of any of the four graphic registers, designate
   it: to the register it requests, or to register 0 when it requests
   none.  Then, if that register is invoked to neither graphic plane 0
   nor plane 1, emit the shift sequence chosen by the switch below.
   *P_NCHARS is loaded into the local produced_chars on entry and
   stored back on exit; the (possibly advanced) DST is returned.
   NOTE(review): the ENCODE_* macros are defined outside this section;
   presumably they are what advance DST and produced_chars.  */
4293 static unsigned char *
4294 encode_invocation_designation (struct charset *charset,
4295 struct coding_system *coding,
4296 unsigned char *dst, ptrdiff_t *p_nchars)
/* multibytep is not referenced by name below; presumably it is read
   by the ENCODE_* macros.  */
4298 bool multibytep = coding->dst_multibyte;
4299 ptrdiff_t produced_chars = *p_nchars;
4300 int reg; /* graphic register number */
4301 int id = CHARSET_ID (charset);
4303 /* At first, check designations. */
4304 for (reg = 0; reg < 4; reg++)
4305 if (id == CODING_ISO_DESIGNATION (coding, reg))
4306 break;
/* REG == 4 here means the search above found no register holding
   CHARSET.  */
4308 if (reg >= 4)
4310 /* CHARSET is not yet designated to any graphic registers. */
4311 /* At first check the requested designation. */
4312 reg = CODING_ISO_REQUEST (coding, id);
4313 if (reg < 0)
4314 /* Since CHARSET requests no special designation, designate it
4315 to graphic register 0. */
4316 reg = 0;
4318 ENCODE_DESIGNATION (charset, reg, coding);
4321 if (CODING_ISO_INVOCATION (coding, 0) != reg
4322 && CODING_ISO_INVOCATION (coding, 1) != reg)
4324 /* Since the graphic register REG is not invoked to any graphic
4325 planes, invoke it to graphic plane 0. */
4326 switch (reg)
4328 case 0: /* graphic register 0 */
4329 ENCODE_SHIFT_IN;
4330 break;
4332 case 1: /* graphic register 1 */
4333 ENCODE_SHIFT_OUT;
4334 break;
/* Registers 2 and 3 use a single shift when the coding system sets
   CODING_ISO_FLAG_SINGLE_SHIFT, and a locking shift otherwise.  */
4336 case 2: /* graphic register 2 */
4337 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4338 ENCODE_SINGLE_SHIFT_2;
4339 else
4340 ENCODE_LOCKING_SHIFT_2;
4341 break;
4343 case 3: /* graphic register 3 */
4344 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4345 ENCODE_SINGLE_SHIFT_3;
4346 else
4347 ENCODE_LOCKING_SHIFT_3;
4348 break;
/* Hand the updated character count back to the caller.  */
4352 *p_nchars = produced_chars;
4353 return dst;
4357 /* Produce codes for designation and invocation to reset the graphic
4358 planes and registers to initial state. */
/* NOTE: `coding' is taken from the expansion site; it is not a macro
   parameter.  The body emits ENCODE_SHIFT_IN when graphic plane 0 is
   not invoking register 0, then re-designates every register whose
   initial charset is set (>= 0) and differs from its current
   designation.  The locals `reg' and `charset' are scoped to the
   do/while block.  */
4359 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4360 do { \
4361 int reg; \
4362 struct charset *charset; \
4364 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4365 ENCODE_SHIFT_IN; \
4366 for (reg = 0; reg < 4; reg++) \
4367 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4368 && (CODING_ISO_DESIGNATION (coding, reg) \
4369 != CODING_ISO_INITIAL (coding, reg))) \
4371 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4372 ENCODE_DESIGNATION (charset, reg, coding); \
4374 } while (0)
4377 /* Produce designation sequences for the charsets in the line starting
4378 at CHARBUF at the place pointed to by DST, and return the number of
4379 produced bytes. DST should not point directly into a buffer text area,
4380 which may be relocated by a call to char_charset.
4382 If the current block ends before any end-of-line, we may fail to
4383 find all the necessary designations. */
/* Scan the characters in [CHARBUF, CHARBUF_END) up to the first
   newline, remembering for each graphic register the first charset
   that requests it.  Afterwards emit a designation at DST for every
   remembered register whose current designation differs.  The return
   value is the number of bytes written (dst - orig).  */
4385 static ptrdiff_t
4386 encode_designation_at_bol (struct coding_system *coding,
4387 int *charbuf, int *charbuf_end,
4388 unsigned char *dst)
4390 unsigned char *orig = dst;
4391 struct charset *charset;
4392 /* Table of charsets to be designated to each graphic register. */
4393 int r[4];
4394 int c, found = 0, reg;
/* produced_chars and multibytep are not referenced by name below and
   are never reported to the caller; presumably ENCODE_DESIGNATION
   expects them to exist in scope.  */
4395 ptrdiff_t produced_chars = 0;
4396 bool multibytep = coding->dst_multibyte;
4397 Lisp_Object attrs;
4398 Lisp_Object charset_list;
4400 attrs = CODING_ID_ATTRS (coding->id);
4401 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
/* The symbol Qiso_2022 stands in for the global ISO-2022 charset
   list.  */
4402 if (EQ (charset_list, Qiso_2022))
4403 charset_list = Viso_2022_charset_list;
/* -1 marks "no charset recorded for this register yet".  */
4405 for (reg = 0; reg < 4; reg++)
4406 r[reg] = -1;
/* Stop early once all four registers have a candidate.  */
4408 while (charbuf < charbuf_end && found < 4)
4410 int id;
4412 c = *charbuf++;
4413 if (c == '\n')
4414 break;
4415 charset = char_charset (c, charset_list, NULL);
4416 id = CHARSET_ID (charset);
4417 reg = CODING_ISO_REQUEST (coding, id);
/* Only the first charset requesting a given register is kept.  */
4418 if (reg >= 0 && r[reg] < 0)
4420 found++;
4421 r[reg] = id;
4425 if (found)
4427 for (reg = 0; reg < 4; reg++)
4428 if (r[reg] >= 0
4429 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4430 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4433 return dst - orig;
4436 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Encode the elements of CODING->charbuf as ISO-2022 bytes appended to
   CODING->destination.  Negative charbuf elements are annotations
   (only the charset annotation has an effect here); other elements
   are characters.  Depending on the coding system's flags this also
   emits designations at beginning of line and resets designations and
   invocations at end of line or at control characters.  On exit the
   beginning-of-line state, CODING->produced_char and CODING->produced
   are updated.  This function always returns 0.  */
4438 static bool
4439 encode_coding_iso_2022 (struct coding_system *coding)
4441 bool multibytep = coding->dst_multibyte;
4442 int *charbuf = coding->charbuf;
4443 int *charbuf_end = charbuf + coding->charbuf_used;
4444 unsigned char *dst = coding->destination + coding->produced;
4445 unsigned char *dst_end = coding->destination + coding->dst_bytes;
/* Headroom passed to ASSURE_DESTINATION before each character.  */
4446 int safe_room = 16;
/* True when designations must be emitted before the next character:
   the coding system designates at BOL and the saved state says we are
   at the beginning of a line.  */
4447 bool bol_designation
4448 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4449 && CODING_ISO_BOL (coding));
4450 ptrdiff_t produced_chars = 0;
4451 Lisp_Object attrs, eol_type, charset_list;
4452 bool ascii_compatible;
4453 int c;
/* Charset id taken from the most recent charset annotation, or -1.  */
4454 int preferred_charset_id = -1;
4456 CODING_GET_INFO (coding, attrs, charset_list);
4457 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
/* A vector-valued EOL type is treated as Unix here.  */
4458 if (VECTORP (eol_type))
4459 eol_type = Qunix;
4461 setup_iso_safe_charsets (attrs);
4462 /* Charset list may have been changed. */
4463 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4464 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
/* ASCII may be emitted verbatim only when the coding system is marked
   ASCII compatible and uses neither designation nor locking shift.  */
4466 ascii_compatible
4467 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4468 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4469 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4471 while (charbuf < charbuf_end)
4473 ASSURE_DESTINATION (safe_room);
4475 if (bol_designation)
4477 /* We have to produce designation sequences if any now. */
/* The sequences are built in a stack buffer and copied afterwards, so
   DST can be corrected first if the destination was relocated.  */
4478 unsigned char desig_buf[16];
4479 ptrdiff_t nbytes;
4480 ptrdiff_t offset;
4482 charset_map_loaded = 0;
4483 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4484 desig_buf);
/* A non-zero offset from coding_change_destination is applied to both
   destination pointers.  */
4485 if (charset_map_loaded
4486 && (offset = coding_change_destination (coding)))
4488 dst += offset;
4489 dst_end += offset;
4491 memcpy (dst, desig_buf, nbytes);
4492 dst += nbytes;
4493 /* We are sure that designation sequences are all ASCII bytes. */
4494 produced_chars += nbytes;
4495 bol_designation = 0;
4496 ASSURE_DESTINATION (safe_room);
4499 c = *charbuf++;
/* A negative element starts an annotation; -c is the number of charbuf
   elements it occupies, counting this one (see the skip below).  */
4501 if (c < 0)
4503 /* Handle an annotation. */
4504 switch (*charbuf)
4506 case CODING_ANNOTATE_COMPOSITION_MASK:
4507 /* Not yet implemented. */
4508 break;
4509 case CODING_ANNOTATE_CHARSET_MASK:
/* The preferred charset is honoured only if it is a member of this
   coding system's charset list.  */
4510 preferred_charset_id = charbuf[2];
4511 if (preferred_charset_id >= 0
4512 && NILP (Fmemq (make_number (preferred_charset_id),
4513 charset_list)))
4514 preferred_charset_id = -1;
4515 break;
4516 default:
4517 emacs_abort ();
/* One element was consumed above; skip the rest of the annotation.  */
4519 charbuf += -c - 1;
4520 continue;
4523 /* Now encode the character C. */
4524 if (c < 0x20 || c == 0x7F)
4526 if (c == '\n'
4527 || (c == '\r' && EQ (eol_type, Qmac)))
4529 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4530 ENCODE_RESET_PLANE_AND_REGISTER ();
/* With INIT_AT_BOL only the recorded designation state is reset here;
   nothing is emitted by this loop.  */
4531 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4533 int i;
4535 for (i = 0; i < 4; i++)
4536 CODING_ISO_DESIGNATION (coding, i)
4537 = CODING_ISO_INITIAL (coding, i);
4539 bol_designation = ((CODING_ISO_FLAGS (coding)
4540 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4541 != 0);
4543 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4544 ENCODE_RESET_PLANE_AND_REGISTER ();
4545 EMIT_ONE_ASCII_BYTE (c);
4547 else if (ASCII_CHAR_P (c))
4549 if (ascii_compatible)
4550 EMIT_ONE_ASCII_BYTE (c);
4551 else
4553 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4554 ENCODE_ISO_CHARACTER (charset, c);
/* Raw 8-bit bytes are written out unchanged.  */
4557 else if (CHAR_BYTE8_P (c))
4559 c = CHAR_TO_BYTE8 (c);
4560 EMIT_ONE_BYTE (c);
4562 else
4564 struct charset *charset;
/* Try the annotated charset first; fall back to searching the whole
   charset list when it cannot encode C.  */
4566 if (preferred_charset_id >= 0)
4568 bool result;
4570 charset = CHARSET_FROM_ID (preferred_charset_id);
4571 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4572 if (! result)
4573 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4574 NULL, charset);
4576 else
4577 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4578 NULL, charset);
/* No charset was found for C: substitute either the "inhibit
   substitution" marker encoded as ASCII (safe-encoding mode) or the
   coding system's default character.  */
4579 if (!charset)
4581 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4583 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4584 charset = CHARSET_FROM_ID (charset_ascii);
4586 else
4588 c = coding->default_char;
4589 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4590 charset_list, NULL, charset);
4593 ENCODE_ISO_CHARACTER (charset, c);
/* At the very end of the text, return to the initial state if the
   coding system resets at end of line.  */
4597 if (coding->mode & CODING_MODE_LAST_BLOCK
4598 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4600 ASSURE_DESTINATION (safe_room);
4601 ENCODE_RESET_PLANE_AND_REGISTER ();
4603 record_conversion_result (coding, CODING_RESULT_SUCCESS);
/* Save the beginning-of-line state for the next call.  */
4604 CODING_ISO_BOL (coding) = bol_designation;
4605 coding->produced_char += produced_chars;
4606 coding->produced = dst - coding->destination;
4607 return 0;
4611 /*** 8,9. SJIS and BIG5 handlers ***/
4613 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4614 quite widely. So, for the moment, Emacs supports them directly in
4615 C code. But, in the future, they may be supported only by CCL. */
4617 /* SJIS is a coding system encoding three character sets: ASCII, right
4618 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4619 as is. A character of charset katakana-jisx0201 is encoded by
4620 "position-code + 0x80". A character of charset japanese-jisx0208
4621 is encoded in two bytes, with the two position-codes divided and shifted
4622 so that they fit in the ranges below.
4624 --- CODE RANGE of SJIS ---
4625 (character set) (range)
4626 ASCII 0x00 .. 0x7F
4627 KATAKANA-JISX0201 0xA0 .. 0xDF
4628 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4629 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4630 -------------------------------
4634 /* BIG5 is a coding system encoding two character sets: ASCII and
4635 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4636 character set and is encoded in two bytes.
4638 --- CODE RANGE of BIG5 ---
4639 (character set) (range)
4640 ASCII 0x00 .. 0x7F
4641 Big5 (1st byte) 0xA1 .. 0xFE
4642 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4643 --------------------------
4647 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4648 Return true if a text is encoded in SJIS. */
/* Scan CODING's source bytes and decide whether they can be SJIS.
   On a byte that fits none of the accepted ranges, set
   CATEGORY_MASK_SJIS in DETECT_INFO->rejected and return 0.  When the
   input runs out, OR the `found' mask into DETECT_INFO->found and
   return 1, unless the last block ended in the middle of a sequence,
   which is also a rejection.
   NOTE(review): ONE_MORE_BYTE is defined outside this section; it
   evidently jumps to `no_more_source' at end of input, since nothing
   else in this function reaches that label.  */
4650 static bool
4651 detect_coding_sjis (struct coding_system *coding,
4652 struct coding_detection_info *detect_info)
4654 const unsigned char *src = coding->source, *src_base;
4655 const unsigned char *src_end = coding->source + coding->src_bytes;
4656 bool multibytep = coding->src_multibyte;
4657 ptrdiff_t consumed_chars = 0;
4658 int found = 0;
4659 int c;
4660 Lisp_Object attrs, charset_list;
4661 int max_first_byte_of_2_byte_code;
4663 CODING_GET_INFO (coding, attrs, charset_list);
/* Lead bytes up to 0xFC are accepted only when the coding system lists
   more than three charsets; otherwise the limit is 0xEF.  */
4664 max_first_byte_of_2_byte_code
4665 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4667 detect_info->checked |= CATEGORY_MASK_SJIS;
4668 /* A coding system of this category is always ASCII compatible. */
4669 src += coding->head_ascii;
4671 while (1)
/* src_base marks the start of the sequence being examined; it is
   compared with src at no_more_source to detect a truncated
   sequence.  */
4673 src_base = src;
4674 ONE_MORE_BYTE (c);
4675 if (c < 0x80)
4676 continue;
/* Lead byte of a two-byte code: validate the trail byte.  */
4677 if ((c >= 0x81 && c <= 0x9F)
4678 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4680 ONE_MORE_BYTE (c);
4681 if (c < 0x40 || c == 0x7F || c > 0xFC)
4682 break;
4683 found = CATEGORY_MASK_SJIS;
/* Single-byte code in 0xA0..0xDF.  */
4685 else if (c >= 0xA0 && c < 0xE0)
4686 found = CATEGORY_MASK_SJIS;
4687 else
4688 break;
/* Reached only via `break': an invalid byte was seen.  */
4690 detect_info->rejected |= CATEGORY_MASK_SJIS;
4691 return 0;
4693 no_more_source:
/* src_base < src means at least one byte of an unfinished sequence was
   consumed; that is an error only when no more input will follow.  */
4694 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4696 detect_info->rejected |= CATEGORY_MASK_SJIS;
4697 return 0;
4699 detect_info->found |= found;
4700 return 1;
4703 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4704 Return true if a text is encoded in BIG5. */
4706 static bool
4707 detect_coding_big5 (struct coding_system *coding,
4708 struct coding_detection_info *detect_info)
4710 const unsigned char *src = coding->source, *src_base;
4711 const unsigned char *src_end = coding->source + coding->src_bytes;
4712 bool multibytep = coding->src_multibyte;
4713 ptrdiff_t consumed_chars = 0;
4714 int found = 0;
4715 int c;
4717 detect_info->checked |= CATEGORY_MASK_BIG5;
4718 /* A coding system of this category is always ASCII compatible. */
4719 src += coding->head_ascii;
4721 while (1)
4723 src_base = src;
4724 ONE_MORE_BYTE (c);
4725 if (c < 0x80)
4726 continue;
4727 if (c >= 0xA1)
4729 ONE_MORE_BYTE (c);
4730 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4731 return 0;
4732 found = CATEGORY_MASK_BIG5;
4734 else
4735 break;
4737 detect_info->rejected |= CATEGORY_MASK_BIG5;
4738 return 0;
4740 no_more_source:
4741 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4743 detect_info->rejected |= CATEGORY_MASK_BIG5;
4744 return 0;
4746 detect_info->found |= found;
4747 return 1;
4750 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
/* Decode SJIS bytes from CODING->source (starting at CODING->consumed)
   into character codes appended to CODING->charbuf.  The first three
   elements of the coding system's charset list are used as the roman,
   kana and kanji charsets; an optional fourth is the second kanji
   charset.  Invalid bytes are passed through one at a time as raw-byte
   characters and counted in CODING->errors.  On exit
   CODING->consumed_char, CODING->consumed and CODING->charbuf_used
   are updated.  */
4752 static void
4753 decode_coding_sjis (struct coding_system *coding)
4755 const unsigned char *src = coding->source + coding->consumed;
4756 const unsigned char *src_end = coding->source + coding->src_bytes;
4757 const unsigned char *src_base;
4758 int *charbuf = coding->charbuf + coding->charbuf_used;
4759 /* We may produce one charset annotation in one loop and one more at
4760 the end. */
4761 int *charbuf_end
4762 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4763 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4764 bool multibytep = coding->src_multibyte;
4765 struct charset *charset_roman, *charset_kanji, *charset_kana;
4766 struct charset *charset_kanji2;
4767 Lisp_Object attrs, charset_list, val;
/* char_offset counts decoded characters; last_offset and last_id track
   the start and charset of the current run for ADD_CHARSET_DATA.  */
4768 ptrdiff_t char_offset = coding->produced_char;
4769 ptrdiff_t last_offset = char_offset;
4770 int last_id = charset_ascii;
4771 bool eol_dos
4772 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
/* Byte read ahead after a CR under DOS EOL, or -1 when none is
   pending.  */
4773 int byte_after_cr = -1;
4775 CODING_GET_INFO (coding, attrs, charset_list);
4777 val = charset_list;
4778 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4779 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4780 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4781 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4783 while (1)
4785 int c, c1;
4786 struct charset *charset;
/* Remember where this sequence starts so it can be rewound on error
   and so the consumed counts exclude an unfinished sequence.  */
4788 src_base = src;
4789 consumed_chars_base = consumed_chars;
4791 if (charbuf >= charbuf_end)
/* Output is full.  A pending look-ahead byte has not been decoded yet,
   so back up by one to leave it unconsumed.  */
4793 if (byte_after_cr >= 0)
4794 src_base--;
4795 break;
/* Replay the look-ahead byte, if any, before reading a new one.  */
4798 if (byte_after_cr >= 0)
4799 c = byte_after_cr, byte_after_cr = -1;
4800 else
4801 ONE_MORE_BYTE (c);
4802 if (c < 0)
4803 goto invalid_code;
4804 if (c < 0x80)
/* Under DOS EOL, read the byte following a CR ahead of time.  */
4806 if (eol_dos && c == '\r')
4807 ONE_MORE_BYTE (byte_after_cr);
4808 charset = charset_roman;
4810 else if (c == 0x80 || c == 0xA0)
4811 goto invalid_code;
4812 else if (c >= 0xA1 && c <= 0xDF)
4814 /* SJIS -> JISX0201-Kana */
4815 c &= 0x7F;
4816 charset = charset_kana;
/* Remaining lead bytes up to 0xEF: 0x81..0x9F and 0xE0..0xEF.  */
4818 else if (c <= 0xEF)
4820 /* SJIS -> JISX0208 */
4821 ONE_MORE_BYTE (c1);
4822 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4823 goto invalid_code;
4824 c = (c << 8) | c1;
4825 SJIS_TO_JIS (c);
4826 charset = charset_kanji;
/* Lead bytes 0xF0..0xFC are valid only when a fourth charset is
   configured.  */
4828 else if (c <= 0xFC && charset_kanji2)
4830 /* SJIS -> JISX0213-2 */
4831 ONE_MORE_BYTE (c1);
4832 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4833 goto invalid_code;
4834 c = (c << 8) | c1;
4835 SJIS_TO_JIS2 (c);
4836 charset = charset_kanji2;
4838 else
4839 goto invalid_code;
/* On a change to a different non-ASCII charset, close the previous
   non-ASCII run with a charset annotation and start a new run.  */
4840 if (charset->id != charset_ascii
4841 && last_id != charset->id)
4843 if (last_id != charset_ascii)
4844 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4845 last_id = charset->id;
4846 last_offset = char_offset;
4848 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4849 *charbuf++ = c;
4850 char_offset++;
4851 continue;
4853 invalid_code:
/* Rewind to the start of the bad sequence and emit only its first
   byte: as a raw-byte character, or as -c when ONE_MORE_BYTE yields a
   negative value.  */
4854 src = src_base;
4855 consumed_chars = consumed_chars_base;
4856 ONE_MORE_BYTE (c);
4857 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4858 char_offset++;
4859 coding->errors++;
4862 no_more_source:
/* Close the final non-ASCII run, then report progress up to the start
   of the last (possibly unfinished) sequence.  */
4863 if (last_id != charset_ascii)
4864 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4865 coding->consumed_char += consumed_chars_base;
4866 coding->consumed = src_base - coding->source;
4867 coding->charbuf_used = charbuf - coding->charbuf;
/* Decode BIG5 bytes from CODING->source (starting at CODING->consumed)
   into character codes appended to CODING->charbuf.  The first two
   elements of the coding system's charset list are used as the roman
   and Big5 charsets.  Invalid bytes are passed through one at a time
   as raw-byte characters and counted in CODING->errors.  On exit
   CODING->consumed_char, CODING->consumed and CODING->charbuf_used
   are updated.  */
4870 static void
4871 decode_coding_big5 (struct coding_system *coding)
4873 const unsigned char *src = coding->source + coding->consumed;
4874 const unsigned char *src_end = coding->source + coding->src_bytes;
4875 const unsigned char *src_base;
4876 int *charbuf = coding->charbuf + coding->charbuf_used;
4877 /* We may produce one charset annotation in one loop and one more at
4878 the end. */
4879 int *charbuf_end
4880 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4881 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4882 bool multibytep = coding->src_multibyte;
4883 struct charset *charset_roman, *charset_big5;
4884 Lisp_Object attrs, charset_list, val;
/* char_offset counts decoded characters; last_offset and last_id track
   the start and charset of the current run for ADD_CHARSET_DATA.  */
4885 ptrdiff_t char_offset = coding->produced_char;
4886 ptrdiff_t last_offset = char_offset;
4887 int last_id = charset_ascii;
4888 bool eol_dos
4889 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
/* Byte read ahead after a CR under DOS EOL, or -1 when none is
   pending.  */
4890 int byte_after_cr = -1;
4892 CODING_GET_INFO (coding, attrs, charset_list);
4893 val = charset_list;
4894 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4895 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4897 while (1)
4899 int c, c1;
4900 struct charset *charset;
/* Remember where this sequence starts so it can be rewound on error
   and so the consumed counts exclude an unfinished sequence.  */
4902 src_base = src;
4903 consumed_chars_base = consumed_chars;
4905 if (charbuf >= charbuf_end)
/* Output is full.  A pending look-ahead byte has not been decoded yet,
   so back up by one to leave it unconsumed.  */
4907 if (byte_after_cr >= 0)
4908 src_base--;
4909 break;
/* Replay the look-ahead byte, if any, before reading a new one.  */
4912 if (byte_after_cr >= 0)
4913 c = byte_after_cr, byte_after_cr = -1;
4914 else
4915 ONE_MORE_BYTE (c);
4917 if (c < 0)
4918 goto invalid_code;
4919 if (c < 0x80)
/* Under DOS EOL, read the byte following a CR ahead of time.  */
4921 if (eol_dos && c == '\r')
4922 ONE_MORE_BYTE (byte_after_cr);
4923 charset = charset_roman;
4925 else
4927 /* BIG5 -> Big5 */
/* Lead byte must be 0xA1..0xFE; trail byte must be 0x40..0x7E or
   0xA1..0xFE.  */
4928 if (c < 0xA1 || c > 0xFE)
4929 goto invalid_code;
4930 ONE_MORE_BYTE (c1);
4931 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4932 goto invalid_code;
4933 c = c << 8 | c1;
4934 charset = charset_big5;
/* On a change to a different non-ASCII charset, close the previous
   non-ASCII run with a charset annotation and start a new run.  */
4936 if (charset->id != charset_ascii
4937 && last_id != charset->id)
4939 if (last_id != charset_ascii)
4940 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4941 last_id = charset->id;
4942 last_offset = char_offset;
4944 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4945 *charbuf++ = c;
4946 char_offset++;
4947 continue;
4949 invalid_code:
/* Rewind to the start of the bad sequence and emit only its first
   byte: as a raw-byte character, or as -c when ONE_MORE_BYTE yields a
   negative value.  */
4950 src = src_base;
4951 consumed_chars = consumed_chars_base;
4952 ONE_MORE_BYTE (c);
4953 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4954 char_offset++;
4955 coding->errors++;
4958 no_more_source:
/* Close the final non-ASCII run, then report progress up to the start
   of the last (possibly unfinished) sequence.  */
4959 if (last_id != charset_ascii)
4960 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4961 coding->consumed_char += consumed_chars_base;
4962 coding->consumed = src_base - coding->source;
4963 coding->charbuf_used = charbuf - coding->charbuf;
4966 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4967 This function can encode charsets `ascii', `katakana-jisx0201',
4968 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4969 are sure that all these charsets are registered as official charset
4970 (i.e. do not have extended leading-codes). Characters of other
4971 charsets are produced without any encoding. */
/* Encode the characters in CODING->charbuf as SJIS bytes appended to
   CODING->destination.  The second, third and optional fourth
   elements of the coding system's charset list are used as the kana,
   kanji and second kanji charsets; the first element is skipped
   here.  On exit CODING->produced_char and CODING->produced are
   updated.  This function always returns 0.  */
4973 static bool
4974 encode_coding_sjis (struct coding_system *coding)
4976 bool multibytep = coding->dst_multibyte;
4977 int *charbuf = coding->charbuf;
4978 int *charbuf_end = charbuf + coding->charbuf_used;
4979 unsigned char *dst = coding->destination + coding->produced;
4980 unsigned char *dst_end = coding->destination + coding->dst_bytes;
/* Headroom passed to ASSURE_DESTINATION before each character.  */
4981 int safe_room = 4;
4982 ptrdiff_t produced_chars = 0;
4983 Lisp_Object attrs, charset_list, val;
4984 bool ascii_compatible;
4985 struct charset *charset_kanji, *charset_kana;
4986 struct charset *charset_kanji2;
4987 int c;
4989 CODING_GET_INFO (coding, attrs, charset_list);
/* XCDR skips the first charset of the list.  */
4990 val = XCDR (charset_list);
4991 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4992 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4993 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4995 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4997 while (charbuf < charbuf_end)
4999 ASSURE_DESTINATION (safe_room);
5000 c = *charbuf++;
5001 /* Now encode the character C. */
5002 if (ASCII_CHAR_P (c) && ascii_compatible)
5003 EMIT_ONE_ASCII_BYTE (c);
/* Raw 8-bit bytes are written out unchanged.  */
5004 else if (CHAR_BYTE8_P (c))
5006 c = CHAR_TO_BYTE8 (c);
5007 EMIT_ONE_BYTE (c);
5009 else
5011 unsigned code;
5012 struct charset *charset;
5013 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5014 &code, charset);
/* No charset was found for C: substitute either the "inhibit
   substitution" marker as an ASCII code (safe-encoding mode) or the
   coding system's default character.  */
5016 if (!charset)
5018 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5020 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5021 charset = CHARSET_FROM_ID (charset_ascii);
5023 else
5025 c = coding->default_char;
5026 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5027 charset_list, &code, charset);
5030 if (code == CHARSET_INVALID_CODE (charset))
5031 emacs_abort ();
5032 if (charset == charset_kanji)
5034 int c1, c2;
5035 JIS_TO_SJIS (code);
5036 c1 = code >> 8, c2 = code & 0xFF;
5037 EMIT_TWO_BYTES (c1, c2);
/* Kana codes are emitted as one byte with the high bit set.  */
5039 else if (charset == charset_kana)
5040 EMIT_ONE_BYTE (code | 0x80);
5041 else if (charset_kanji2 && charset == charset_kanji2)
5043 int c1, c2;
/* Only the first-byte values tested below are converted with
   JIS_TO_SJIS2; for any other value the low 7 bits of the code are
   emitted as a single byte.  */
5045 c1 = code >> 8;
5046 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5047 || c1 == 0x28
5048 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5050 JIS_TO_SJIS2 (code);
5051 c1 = code >> 8, c2 = code & 0xFF;
5052 EMIT_TWO_BYTES (c1, c2);
5054 else
5055 EMIT_ONE_ASCII_BYTE (code & 0x7F);
/* Any other charset: emit the low 7 bits of the code as one byte.  */
5057 else
5058 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5061 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5062 coding->produced_char += produced_chars;
5063 coding->produced = dst - coding->destination;
5064 return 0;
/* Encode the characters in CODING->charbuf as BIG5 bytes appended to
   CODING->destination.  The second element of the coding system's
   charset list is used as the Big5 charset.  On exit
   CODING->produced_char and CODING->produced are updated.  This
   function always returns 0.  */
5067 static bool
5068 encode_coding_big5 (struct coding_system *coding)
5070 bool multibytep = coding->dst_multibyte;
5071 int *charbuf = coding->charbuf;
5072 int *charbuf_end = charbuf + coding->charbuf_used;
5073 unsigned char *dst = coding->destination + coding->produced;
5074 unsigned char *dst_end = coding->destination + coding->dst_bytes;
/* Headroom passed to ASSURE_DESTINATION before each character.  */
5075 int safe_room = 4;
5076 ptrdiff_t produced_chars = 0;
5077 Lisp_Object attrs, charset_list, val;
5078 bool ascii_compatible;
5079 struct charset *charset_big5;
5080 int c;
5082 CODING_GET_INFO (coding, attrs, charset_list);
/* XCDR skips the first charset of the list.  */
5083 val = XCDR (charset_list);
5084 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5085 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5087 while (charbuf < charbuf_end)
5089 ASSURE_DESTINATION (safe_room);
5090 c = *charbuf++;
5091 /* Now encode the character C. */
5092 if (ASCII_CHAR_P (c) && ascii_compatible)
5093 EMIT_ONE_ASCII_BYTE (c);
/* Raw 8-bit bytes are written out unchanged.  */
5094 else if (CHAR_BYTE8_P (c))
5096 c = CHAR_TO_BYTE8 (c);
5097 EMIT_ONE_BYTE (c);
5099 else
5101 unsigned code;
5102 struct charset *charset;
5103 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5104 &code, charset);
/* No charset was found for C: substitute either the "inhibit
   substitution" marker as an ASCII code (safe-encoding mode) or the
   coding system's default character.  */
5106 if (! charset)
5108 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5110 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5111 charset = CHARSET_FROM_ID (charset_ascii);
5113 else
5115 c = coding->default_char;
5116 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5117 charset_list, &code, charset);
5120 if (code == CHARSET_INVALID_CODE (charset))
5121 emacs_abort ();
/* Big5 codes are emitted high byte first.  */
5122 if (charset == charset_big5)
5124 int c1, c2;
5126 c1 = code >> 8, c2 = code & 0xFF;
5127 EMIT_TWO_BYTES (c1, c2);
/* Any other charset: emit the low 7 bits of the code as one byte.  */
5129 else
5130 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5133 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5134 coding->produced_char += produced_chars;
5135 coding->produced = dst - coding->destination;
5136 return 0;
5140 /*** 10. CCL handlers ***/
5142 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5143 Return true if a text is encoded in a coding system of which
5144 encoder/decoder are written in CCL program. */
/* Scan the source bytes of the CODING passed in and check each one
   against the `valids' table of the CCL coding category.  On a
   negative value or a byte whose table entry is zero, set
   CATEGORY_MASK_CCL in DETECT_INFO->rejected and return 0.  When the
   input runs out, OR the `found' mask into DETECT_INFO->found and
   return 1.
   NOTE(review): ONE_MORE_BYTE is defined outside this section; it
   evidently jumps to `no_more_source' at end of input, since nothing
   else in this function reaches that label.  */
5146 static bool
5147 detect_coding_ccl (struct coding_system *coding,
5148 struct coding_detection_info *detect_info)
/* The source pointers and head_ascii are captured from the caller's
   CODING before `coding' is redirected below.  */
5150 const unsigned char *src = coding->source, *src_base;
5151 const unsigned char *src_end = coding->source + coding->src_bytes;
5152 bool multibytep = coding->src_multibyte;
5153 ptrdiff_t consumed_chars = 0;
5154 int found = 0;
5155 unsigned char *valids;
5156 ptrdiff_t head_ascii = coding->head_ascii;
5157 Lisp_Object attrs;
5159 detect_info->checked |= CATEGORY_MASK_CCL;
/* From here on `coding' refers to the coding system registered for
   the CCL category, not to the argument.  */
5161 coding = &coding_categories[coding_category_ccl];
5162 valids = CODING_CCL_VALIDS (coding);
5163 attrs = CODING_ID_ATTRS (coding->id);
/* The leading ASCII run is skipped only for an ASCII-compatible
   coding system.  */
5164 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5165 src += head_ascii;
5167 while (1)
5169 int c;
5171 src_base = src;
5172 ONE_MORE_BYTE (c);
5173 if (c < 0 || ! valids[c])
5174 break;
/* A table entry greater than 1 counts as positive evidence; an entry
   of exactly 1 is merely allowed.  */
5175 if ((valids[c] > 1))
5176 found = CATEGORY_MASK_CCL;
/* Reached only via `break': an invalid byte was seen.  */
5178 detect_info->rejected |= CATEGORY_MASK_CCL;
5179 return 0;
5181 no_more_source:
5182 detect_info->found |= found;
5183 return 1;
/* Decode CODING's source by running the coding system's CCL program
   over it.  The source is fed to ccl_driver in batches of at most
   1024 elements (characters for a multibyte source, bytes otherwise)
   and the program writes directly into CODING->charbuf.  The final
   CCL status is mapped to a conversion result, and
   CODING->consumed_char, CODING->consumed and CODING->charbuf_used
   are updated.  */
5186 static void
5187 decode_coding_ccl (struct coding_system *coding)
5189 const unsigned char *src = coding->source + coding->consumed;
5190 const unsigned char *src_end = coding->source + coding->src_bytes;
5191 int *charbuf = coding->charbuf + coding->charbuf_used;
5192 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5193 ptrdiff_t consumed_chars = 0;
5194 bool multibytep = coding->src_multibyte;
5195 struct ccl_program *ccl = &coding->spec.ccl->ccl;
/* source_byteidx[i] is the byte offset from SRC of the i-th batch
   element; it has one extra slot for the offset just past the last
   element.  It is filled in only for a multibyte source.  */
5196 int source_charbuf[1024];
5197 int source_byteidx[1025];
5198 Lisp_Object attrs, charset_list;
5200 CODING_GET_INFO (coding, attrs, charset_list);
5202 while (1)
5204 const unsigned char *p = src;
5205 ptrdiff_t offset;
5206 int i = 0;
5208 if (multibytep)
5210 while (i < 1024 && p < src_end)
5212 source_byteidx[i] = p - src;
5213 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5215 source_byteidx[i] = p - src;
5217 else
5218 while (i < 1024 && p < src_end)
5219 source_charbuf[i++] = *p++;
/* Tell the CCL program when this batch reaches the end of the last
   block.  */
5221 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5222 ccl->last_block = true;
5223 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5224 charset_map_loaded = 0;
5225 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5226 charset_list);
/* A non-zero offset from coding_change_source is applied to every
   pointer into the source.  */
5227 if (charset_map_loaded
5228 && (offset = coding_change_source (coding)))
5230 p += offset;
5231 src += offset;
5232 src_end += offset;
5234 charbuf += ccl->produced;
/* ccl->consumed counts batch elements; convert it to a byte count
   through the index table for a multibyte source.  */
5235 if (multibytep)
5236 src += source_byteidx[ccl->consumed];
5237 else
5238 src += ccl->consumed;
5239 consumed_chars += ccl->consumed;
/* Continue only when the program merely ran out of this batch and more
   source remains.  */
5240 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5241 break;
5244 switch (ccl->status)
5246 case CCL_STAT_SUSPEND_BY_SRC:
5247 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5248 break;
5249 case CCL_STAT_SUSPEND_BY_DST:
5250 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5251 break;
5252 case CCL_STAT_QUIT:
5253 case CCL_STAT_INVALID_CMD:
5254 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5255 break;
5256 default:
5257 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5258 break;
5260 coding->consumed_char += consumed_chars;
5261 coding->consumed = src - coding->source;
5262 coding->charbuf_used = charbuf - coding->charbuf;
5265 static bool
5266 encode_coding_ccl (struct coding_system *coding)
5268 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5269 bool multibytep = coding->dst_multibyte;
5270 int *charbuf = coding->charbuf;
5271 int *charbuf_end = charbuf + coding->charbuf_used;
5272 unsigned char *dst = coding->destination + coding->produced;
5273 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5274 int destination_charbuf[1024];
5275 ptrdiff_t produced_chars = 0;
5276 int i;
5277 Lisp_Object attrs, charset_list;
5279 CODING_GET_INFO (coding, attrs, charset_list);
5280 if (coding->consumed_char == coding->src_chars
5281 && coding->mode & CODING_MODE_LAST_BLOCK)
5282 ccl->last_block = true;
5286 ptrdiff_t offset;
5288 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5289 charset_map_loaded = 0;
5290 ccl_driver (ccl, charbuf, destination_charbuf,
5291 charbuf_end - charbuf, 1024, charset_list);
5292 if (charset_map_loaded
5293 && (offset = coding_change_destination (coding)))
5294 dst += offset;
5295 if (multibytep)
5297 ASSURE_DESTINATION (ccl->produced * 2);
5298 for (i = 0; i < ccl->produced; i++)
5299 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5301 else
5303 ASSURE_DESTINATION (ccl->produced);
5304 for (i = 0; i < ccl->produced; i++)
5305 *dst++ = destination_charbuf[i] & 0xFF;
5306 produced_chars += ccl->produced;
5308 charbuf += ccl->consumed;
5309 if (ccl->status == CCL_STAT_QUIT
5310 || ccl->status == CCL_STAT_INVALID_CMD)
5311 break;
5313 while (charbuf < charbuf_end);
5315 switch (ccl->status)
5317 case CCL_STAT_SUSPEND_BY_SRC:
5318 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5319 break;
5320 case CCL_STAT_SUSPEND_BY_DST:
5321 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5322 break;
5323 case CCL_STAT_QUIT:
5324 case CCL_STAT_INVALID_CMD:
5325 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5326 break;
5327 default:
5328 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5329 break;
5332 coding->produced_char += produced_chars;
5333 coding->produced = dst - coding->destination;
5334 return 0;
5338 /*** 10, 11. no-conversion handlers ***/
5340 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5342 static void
5343 decode_coding_raw_text (struct coding_system *coding)
5345 bool eol_dos
5346 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5348 coding->chars_at_source = 1;
5349 coding->consumed_char = coding->src_chars;
5350 coding->consumed = coding->src_bytes;
5351 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5353 coding->consumed_char--;
5354 coding->consumed--;
5355 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5357 else
5358 record_conversion_result (coding, CODING_RESULT_SUCCESS);
/* Copy the elements of CODING->charbuf to CODING->destination without
   charset conversion.  There are four cases, selected by whether the
   destination and the source are multibyte.  On exit
   CODING->produced_char and CODING->produced are updated.  This
   function always returns 0.  */
5361 static bool
5362 encode_coding_raw_text (struct coding_system *coding)
5364 bool multibytep = coding->dst_multibyte;
5365 int *charbuf = coding->charbuf;
5366 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5367 unsigned char *dst = coding->destination + coding->produced;
5368 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5369 ptrdiff_t produced_chars = 0;
5370 int c;
/* Multibyte destination: everything goes through the EMIT_* macros.  */
5372 if (multibytep)
5374 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5376 if (coding->src_multibyte)
5377 while (charbuf < charbuf_end)
5379 ASSURE_DESTINATION (safe_room);
5380 c = *charbuf++;
5381 if (ASCII_CHAR_P (c))
5382 EMIT_ONE_ASCII_BYTE (c);
5383 else if (CHAR_BYTE8_P (c))
5385 c = CHAR_TO_BYTE8 (c);
5386 EMIT_ONE_BYTE (c);
5388 else
/* Any other character is expanded into a temporary buffer by
   CHAR_STRING_ADVANCE and each resulting byte is emitted
   separately.  */
5390 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5392 CHAR_STRING_ADVANCE (c, p1);
5395 EMIT_ONE_BYTE (*p0);
5396 p0++;
5398 while (p0 < p1);
/* Unibyte source: each charbuf element is emitted as one byte.  */
5401 else
5402 while (charbuf < charbuf_end)
5404 ASSURE_DESTINATION (safe_room);
5405 c = *charbuf++;
5406 EMIT_ONE_BYTE (c);
/* Unibyte destination: bytes are stored directly through DST.  */
5409 else
5411 if (coding->src_multibyte)
5413 int safe_room = MAX_MULTIBYTE_LENGTH;
5415 while (charbuf < charbuf_end)
5417 ASSURE_DESTINATION (safe_room);
5418 c = *charbuf++;
5419 if (ASCII_CHAR_P (c))
5420 *dst++ = c;
5421 else if (CHAR_BYTE8_P (c))
5422 *dst++ = CHAR_TO_BYTE8 (c);
5423 else
5424 CHAR_STRING_ADVANCE (c, dst);
5427 else
/* Room for the whole remainder is requested up front; the loop still
   stops at DST_END.  */
5429 ASSURE_DESTINATION (charbuf_end - charbuf);
5430 while (charbuf < charbuf_end && dst < dst_end)
5431 *dst++ = *charbuf++;
/* For a unibyte destination the character count is the number of
   bytes written by this call.  */
5433 produced_chars = dst - (coding->destination + coding->produced);
5435 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5436 coding->produced_char += produced_chars;
5437 coding->produced = dst - coding->destination;
5438 return 0;
5441 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5442 Return true if a text is encoded in a charset-based coding system. */
/* Scan the source bytes of the CODING passed in and check them against
   the `charset_valids' table of the charset coding category.  On a
   byte the table does not allow, an out-of-range following byte of a
   multi-dimensional charset, or input ending inside such a sequence,
   set CATEGORY_MASK_CHARSET in DETECT_INFO->rejected and return 0.
   When the input runs out cleanly, OR the `found' mask into
   DETECT_INFO->found and return 1.
   NOTE(review): ONE_MORE_BYTE is defined outside this section; it
   evidently jumps to `no_more_source' at end of input, since nothing
   else in this function reaches that label.  */
5444 static bool
5445 detect_coding_charset (struct coding_system *coding,
5446 struct coding_detection_info *detect_info)
/* The source pointers and head_ascii are captured from the caller's
   CODING before `coding' is redirected below.  */
5448 const unsigned char *src = coding->source, *src_base;
5449 const unsigned char *src_end = coding->source + coding->src_bytes;
5450 bool multibytep = coding->src_multibyte;
5451 ptrdiff_t consumed_chars = 0;
5452 Lisp_Object attrs, valids, name;
5453 int found = 0;
5454 ptrdiff_t head_ascii = coding->head_ascii;
5455 bool check_latin_extra = 0;
5457 detect_info->checked |= CATEGORY_MASK_CHARSET;
/* From here on `coding' refers to the coding system registered for
   the charset category, not to the argument.  */
5459 coding = &coding_categories[coding_category_charset];
5460 attrs = CODING_ID_ATTRS (coding->id);
5461 valids = AREF (attrs, coding_attr_charset_valids);
5462 name = CODING_ID_NAME (coding->id);
/* Coding systems whose name starts with "iso-8859-" or "iso-latin-"
   get the extra check on bytes 0x80..0x9F in the loop below.  */
5463 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5464 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5465 || strncmp (SSDATA (SYMBOL_NAME (name)),
5466 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5467 check_latin_extra = 1;
/* The leading ASCII run is skipped only for an ASCII-compatible
   coding system.  */
5469 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5470 src += head_ascii;
5472 while (1)
5474 int c;
5475 Lisp_Object val;
5476 struct charset *charset;
5477 int dim, idx;
5479 src_base = src;
5480 ONE_MORE_BYTE (c);
/* Negative values are skipped without consulting the table.  */
5481 if (c < 0)
5482 continue;
/* A nil table entry means this byte cannot start a character.  */
5483 val = AREF (valids, c);
5484 if (NILP (val))
5485 break;
5486 if (c >= 0x80)
/* For the Latin coding systems, a byte in 0x80..0x9F is accepted only
   if Vlatin_extra_code_table is a vector with a non-nil entry for
   it.  */
5488 if (c < 0xA0
5489 && check_latin_extra
5490 && (!VECTORP (Vlatin_extra_code_table)
5491 || NILP (AREF (Vlatin_extra_code_table, c))))
5492 break;
5493 found = CATEGORY_MASK_CHARSET;
/* An integer entry names a single charset: check each following byte
   against that charset's code space.  */
5495 if (INTEGERP (val))
5497 charset = CHARSET_FROM_ID (XFASTINT (val));
5498 dim = CHARSET_DIMENSION (charset);
5499 for (idx = 1; idx < dim; idx++)
5501 if (src == src_end)
5502 goto too_short;
5503 ONE_MORE_BYTE (c);
5504 if (c < charset->code_space[(dim - 1 - idx) * 4]
5505 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5506 break;
/* The `break' above leaves only the for loop; idx < dim then signals
   the failure to the outer loop.  */
5508 if (idx < dim)
5509 break;
/* Otherwise the entry is a list of candidate charsets, tried in
   order.  */
5511 else
/* NOTE(review): idx is set once and carries over between candidates,
   so bytes already read for an earlier candidate are not read again
   for the next one.  */
5513 idx = 1;
5514 for (; CONSP (val); val = XCDR (val))
5516 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5517 dim = CHARSET_DIMENSION (charset);
5518 while (idx < dim)
5520 if (src == src_end)
5521 goto too_short;
5522 ONE_MORE_BYTE (c);
5523 if (c < charset->code_space[(dim - 1 - idx) * 4]
5524 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5525 break;
5526 idx++;
/* A candidate matched completely; clearing val marks success for the
   CONSP test after the loop.  */
5528 if (idx == dim)
5530 val = Qnil;
5531 break;
/* val is still a cons only when the candidate loop broke out on a
   failure rather than on a match.  */
5534 if (CONSP (val))
5535 break;
/* Reached by `goto too_short' and by any `break' out of the main
   loop.  */
5538 too_short:
5539 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5540 return 0;
5542 no_more_source:
5543 detect_info->found |= found;
5544 return 1;
/* Decode the source bytes of CODING, starting at coding->consumed,
   into coding->charbuf, using the per-byte table stored at
   coding_attr_charset_valids.  A byte that cannot be decoded is stored
   by itself (ASCII as is, other bytes through BYTE8_TO_CHAR) and
   counted in coding->errors.  On exit coding->consumed,
   coding->consumed_char and coding->charbuf_used are updated.  */
5547 static void
5548 decode_coding_charset (struct coding_system *coding)
5550 const unsigned char *src = coding->source + coding->consumed;
5551 const unsigned char *src_end = coding->source + coding->src_bytes;
5552 const unsigned char *src_base;
5553 int *charbuf = coding->charbuf + coding->charbuf_used;
5554 /* We may produce one charset annotation in one loop and one more at
5555 the end. */
5556 int *charbuf_end
5557 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5558 ptrdiff_t consumed_chars = 0, consumed_chars_base;
5559 bool multibytep = coding->src_multibyte;
5560 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5561 Lisp_Object valids;
5562 ptrdiff_t char_offset = coding->produced_char;
/* LAST_OFFSET and LAST_ID describe the current run of characters
   that share one charset.  */
5563 ptrdiff_t last_offset = char_offset;
5564 int last_id = charset_ascii;
5565 bool eol_dos
5566 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
/* Byte read ahead after a CR under DOS EOL, or -1 when none is
   pending.  */
5567 int byte_after_cr = -1;
5569 valids = AREF (attrs, coding_attr_charset_valids);
5571 while (1)
5573 int c;
5574 Lisp_Object val;
5575 struct charset *charset;
5576 int dim;
5577 int len = 1;
5578 unsigned code;
/* Remember where this code starts so that invalid_code can rewind and
   the exit path can report only whole codes as consumed.  */
5580 src_base = src;
5581 consumed_chars_base = consumed_chars;
/* No room left in CHARBUF.  If a byte read after CR is still pending,
   step SRC_BASE back so that byte is not reported as consumed.  */
5583 if (charbuf >= charbuf_end)
5585 if (byte_after_cr >= 0)
5586 src_base--;
5587 break;
/* Serve the pending read-ahead byte before reading a new one.  */
5590 if (byte_after_cr >= 0)
5592 c = byte_after_cr;
5593 byte_after_cr = -1;
5595 else
5597 ONE_MORE_BYTE (c);
5598 if (eol_dos && c == '\r')
5599 ONE_MORE_BYTE (byte_after_cr);
5601 if (c < 0)
5602 goto invalid_code;
5603 code = c;
/* VALIDS maps the leading byte to a charset ID or to a list of
   charset IDs; anything else marks an invalid leading byte.  */
5605 val = AREF (valids, c);
5606 if (! INTEGERP (val) && ! CONSP (val))
5607 goto invalid_code;
5608 if (INTEGERP (val))
5610 charset = CHARSET_FROM_ID (XFASTINT (val));
5611 dim = CHARSET_DIMENSION (charset);
/* Accumulate the remaining bytes of the code, most significant
   first.  */
5612 while (len < dim)
5614 ONE_MORE_BYTE (c);
5615 code = (code << 8) | c;
5616 len++;
5618 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5619 charset, code, c);
5621 else
5623 /* VAL is a list of charset IDs. It is assured that the
5624 list is sorted by charset dimensions (smaller one
5625 comes first). */
5626 while (CONSP (val))
5628 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5629 dim = CHARSET_DIMENSION (charset);
5630 while (len < dim)
5632 ONE_MORE_BYTE (c);
5633 code = (code << 8) | c;
5634 len++;
5636 CODING_DECODE_CHAR (coding, src, src_base,
5637 src_end, charset, code, c);
/* The first charset that yields a non-negative character wins.  */
5638 if (c >= 0)
5639 break;
5640 val = XCDR (val);
5643 if (c < 0)
5644 goto invalid_code;
/* The charset changed to a non-ASCII one: emit the annotation for the
   previous run if that run was non-ASCII, then start a new run.  */
5645 if (charset->id != charset_ascii
5646 && last_id != charset->id)
5648 if (last_id != charset_ascii)
5649 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5650 last_id = charset->id;
5651 last_offset = char_offset;
5654 *charbuf++ = c;
5655 char_offset++;
5656 continue;
/* Rewind to the start of the failed code and store just its first
   byte.  */
5658 invalid_code:
5659 src = src_base;
5660 consumed_chars = consumed_chars_base;
5661 ONE_MORE_BYTE (c);
5662 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5663 char_offset++;
5664 coding->errors++;
/* Emit the annotation for the final run, then publish progress.  */
5667 no_more_source:
5668 if (last_id != charset_ascii)
5669 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5670 coding->consumed_char += consumed_chars_base;
5671 coding->consumed = src_base - coding->source;
5672 coding->charbuf_used = charbuf - coding->charbuf;
/* Encode the characters in coding->charbuf and append the bytes to
   coding->destination.  ASCII characters are emitted as is when the
   coding system is ASCII compatible, eight-bit characters are emitted
   as their raw byte, and any other character is emitted as the 1 to 4
   bytes of the code found by CODING_CHAR_CHARSET.  A character with
   no charset is replaced by CODING_INHIBIT_CHARACTER_SUBSTITUTION in
   safe-encoding mode and by coding->default_char otherwise.  After
   the loop, CODING_RESULT_SUCCESS is recorded, coding->produced and
   coding->produced_char are updated, and 0 is returned.  */
5675 static bool
5676 encode_coding_charset (struct coding_system *coding)
5678 bool multibytep = coding->dst_multibyte;
5679 int *charbuf = coding->charbuf;
5680 int *charbuf_end = charbuf + coding->charbuf_used;
5681 unsigned char *dst = coding->destination + coding->produced;
5682 unsigned char *dst_end = coding->destination + coding->dst_bytes;
/* Room requested from ASSURE_DESTINATION before each character.  */
5683 int safe_room = MAX_MULTIBYTE_LENGTH;
5684 ptrdiff_t produced_chars = 0;
5685 Lisp_Object attrs, charset_list;
5686 bool ascii_compatible;
5687 int c;
5689 CODING_GET_INFO (coding, attrs, charset_list);
5690 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5692 while (charbuf < charbuf_end)
5694 struct charset *charset;
5695 unsigned code;
5697 ASSURE_DESTINATION (safe_room);
5698 c = *charbuf++;
5699 if (ascii_compatible && ASCII_CHAR_P (c))
5700 EMIT_ONE_ASCII_BYTE (c);
5701 else if (CHAR_BYTE8_P (c))
5703 c = CHAR_TO_BYTE8 (c);
5704 EMIT_ONE_BYTE (c);
5706 else
5708 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5709 &code, charset);
/* CHARSET is non-null when C can be encoded; CODE then holds the code
   point, emitted most significant byte first according to the
   charset's dimension.  */
5711 if (charset)
5713 if (CHARSET_DIMENSION (charset) == 1)
5714 EMIT_ONE_BYTE (code);
5715 else if (CHARSET_DIMENSION (charset) == 2)
5716 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5717 else if (CHARSET_DIMENSION (charset) == 3)
5718 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5719 else
5720 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5721 (code >> 8) & 0xFF, code & 0xFF);
5723 else
/* No charset can encode C: emit a substitute byte instead.  */
5725 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5726 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5727 else
5728 c = coding->default_char;
5729 EMIT_ONE_BYTE (c);
5734 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5735 coding->produced_char += produced_chars;
5736 coding->produced = dst - coding->destination;
5737 return 0;
5741 /*** 10. C library functions ***/
5743 /* Setup coding context CODING from information about CODING_SYSTEM.
5744 If CODING_SYSTEM is nil, Qundecided is used in its place. If
5745 CODING_SYSTEM is invalid, signal an error (presumably raised by
   CHECK_CODING_SYSTEM_GET_ID -- confirm against the macro).

   The function fills in coding->id, the mode and common_flags bits,
   the safe-charset table, the default character, and the
   detector/decoder/encoder function pointers chosen by the type of
   the coding system.  */
5747 void
5748 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5750 Lisp_Object attrs;
5751 Lisp_Object eol_type;
5752 Lisp_Object coding_type;
5753 Lisp_Object val;
5755 if (NILP (coding_system))
5756 coding_system = Qundecided;
5758 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5760 attrs = CODING_ID_ATTRS (coding->id);
/* With inhibit_eol_conversion set, behave as if the EOL type were
   Qunix.  */
5761 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5763 coding->mode = 0;
/* A vector EOL type requests decoding and detection; any EOL type
   other than Qunix requests both decoding and encoding.  */
5764 if (VECTORP (eol_type))
5765 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5766 | CODING_REQUIRE_DETECTION_MASK);
5767 else if (! EQ (eol_type, Qunix))
5768 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5769 | CODING_REQUIRE_ENCODING_MASK);
5770 else
5771 coding->common_flags = 0;
5772 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5773 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5774 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5775 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5776 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5777 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
/* The safe-charsets attribute is a string; its length gives the
   largest charset ID plus one.  */
5779 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5780 coding->max_charset_id = SCHARS (val) - 1;
5781 coding->safe_charsets = SDATA (val);
5782 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5783 coding->carryover_bytes = 0;
5784 coding->raw_destination = 0;
/* Everything below depends on the type of the coding system.  */
5786 coding_type = CODING_ATTR_TYPE (attrs);
5787 if (EQ (coding_type, Qundecided))
/* No detector of its own; the raw-text decoder and encoder are
   installed and detection is flagged as required.  */
5789 coding->detector = NULL;
5790 coding->decoder = decode_coding_raw_text;
5791 coding->encoder = encode_coding_raw_text;
5792 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5793 coding->spec.undecided.inhibit_nbd
5794 = (encode_inhibit_flag
5795 (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5796 coding->spec.undecided.inhibit_ied
5797 = (encode_inhibit_flag
5798 (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5799 coding->spec.undecided.prefer_utf_8
5800 = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5802 else if (EQ (coding_type, Qiso_2022))
5804 int i;
5805 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5807 /* Invoke graphic register 0 to plane 0. */
5808 CODING_ISO_INVOCATION (coding, 0) = 0;
5809 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5810 CODING_ISO_INVOCATION (coding, 1)
5811 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5812 /* Setup the initial status of designation. */
5813 for (i = 0; i < 4; i++)
5814 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5815 /* Not single shifting initially. */
5816 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5817 /* Beginning of buffer should also be regarded as bol. */
5818 CODING_ISO_BOL (coding) = 1;
5819 coding->detector = detect_coding_iso_2022;
5820 coding->decoder = decode_coding_iso_2022;
5821 coding->encoder = encode_coding_iso_2022;
5822 if (flags & CODING_ISO_FLAG_SAFE)
5823 coding->mode |= CODING_MODE_SAFE_ENCODING;
5824 coding->common_flags
5825 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5826 | CODING_REQUIRE_FLUSHING_MASK);
5827 if (flags & CODING_ISO_FLAG_COMPOSITION)
5828 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5829 if (flags & CODING_ISO_FLAG_DESIGNATION)
5830 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
/* With full support, re-read the safe-charsets attribute after
   setup_iso_safe_charsets has processed ATTRS.  */
5831 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5833 setup_iso_safe_charsets (attrs);
5834 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5835 coding->max_charset_id = SCHARS (val) - 1;
5836 coding->safe_charsets = SDATA (val);
5838 CODING_ISO_FLAGS (coding) = flags;
5839 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5840 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5841 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5842 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5844 else if (EQ (coding_type, Qcharset))
5846 coding->detector = detect_coding_charset;
5847 coding->decoder = decode_coding_charset;
5848 coding->encoder = encode_coding_charset;
5849 coding->common_flags
5850 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5852 else if (EQ (coding_type, Qutf_8))
/* The BOM attribute is a cons (detect the BOM), t (with BOM), or
   anything else (without BOM).  */
5854 val = AREF (attrs, coding_attr_utf_bom);
5855 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5856 : EQ (val, Qt) ? utf_with_bom
5857 : utf_without_bom);
5858 coding->detector = detect_coding_utf_8;
5859 coding->decoder = decode_coding_utf_8;
5860 coding->encoder = encode_coding_utf_8;
5861 coding->common_flags
5862 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5863 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5864 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5866 else if (EQ (coding_type, Qutf_16))
5868 val = AREF (attrs, coding_attr_utf_bom);
5869 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5870 : EQ (val, Qt) ? utf_with_bom
5871 : utf_without_bom);
/* Any endian attribute other than Qbig selects little endian.  */
5872 val = AREF (attrs, coding_attr_utf_16_endian);
5873 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5874 : utf_16_little_endian);
5875 CODING_UTF_16_SURROGATE (coding) = 0;
5876 coding->detector = detect_coding_utf_16;
5877 coding->decoder = decode_coding_utf_16;
5878 coding->encoder = encode_coding_utf_16;
5879 coding->common_flags
5880 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5881 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5882 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5884 else if (EQ (coding_type, Qccl))
5886 coding->detector = detect_coding_ccl;
5887 coding->decoder = decode_coding_ccl;
5888 coding->encoder = encode_coding_ccl;
5889 coding->common_flags
5890 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5891 | CODING_REQUIRE_FLUSHING_MASK);
5893 else if (EQ (coding_type, Qemacs_mule))
5895 coding->detector = detect_coding_emacs_mule;
5896 coding->decoder = decode_coding_emacs_mule;
5897 coding->encoder = encode_coding_emacs_mule;
5898 coding->common_flags
5899 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
/* With the "full" attribute set and a charset list that is not
   Vemacs_mule_charset_list, build a fresh safe-charsets string: every
   byte is 255 except the entries for the IDs in
   Vemacs_mule_charset_list, which are set to 0.  */
5900 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5901 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5903 Lisp_Object tail, safe_charsets;
5904 int max_charset_id = 0;
5906 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5907 tail = XCDR (tail))
5908 if (max_charset_id < XFASTINT (XCAR (tail)))
5909 max_charset_id = XFASTINT (XCAR (tail));
5910 safe_charsets = make_uninit_string (max_charset_id + 1);
5911 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5912 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5913 tail = XCDR (tail))
5914 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5915 coding->max_charset_id = max_charset_id;
5916 coding->safe_charsets = SDATA (safe_charsets);
5918 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5919 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5921 else if (EQ (coding_type, Qshift_jis))
5923 coding->detector = detect_coding_sjis;
5924 coding->decoder = decode_coding_sjis;
5925 coding->encoder = encode_coding_sjis;
5926 coding->common_flags
5927 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5929 else if (EQ (coding_type, Qbig5))
5931 coding->detector = detect_coding_big5;
5932 coding->decoder = decode_coding_big5;
5933 coding->encoder = encode_coding_big5;
5934 coding->common_flags
5935 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5937 else /* EQ (coding_type, Qraw_text) */
5939 coding->detector = NULL;
5940 coding->decoder = decode_coding_raw_text;
5941 coding->encoder = encode_coding_raw_text;
/* Raw text needs conversion only for EOL handling: decoding unless
   the EOL type is Qunix, and encoding only when the EOL type is
   already decided (not a vector).  */
5942 if (! EQ (eol_type, Qunix))
5944 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5945 if (! VECTORP (eol_type))
5946 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5951 return;
5954 /* Return a list of charsets supported by CODING. */
5956 Lisp_Object
5957 coding_charset_list (struct coding_system *coding)
5959 Lisp_Object attrs, charset_list;
5961 CODING_GET_INFO (coding, attrs, charset_list);
5962 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5964 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5966 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5967 charset_list = Viso_2022_charset_list;
5969 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5971 charset_list = Vemacs_mule_charset_list;
5973 return charset_list;
5977 /* Return a list of charsets supported by CODING-SYSTEM. */
5979 Lisp_Object
5980 coding_system_charset_list (Lisp_Object coding_system)
5982 ptrdiff_t id;
5983 Lisp_Object attrs, charset_list;
5985 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5986 attrs = CODING_ID_ATTRS (id);
5988 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5990 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5992 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5993 charset_list = Viso_2022_charset_list;
5994 else
5995 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5997 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5999 charset_list = Vemacs_mule_charset_list;
6001 else
6003 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6005 return charset_list;
6009 /* Return raw-text or one of its subsidiaries that has the same
6010 eol_type as CODING-SYSTEM. */
6012 Lisp_Object
6013 raw_text_coding_system (Lisp_Object coding_system)
6015 Lisp_Object spec, attrs;
6016 Lisp_Object eol_type, raw_text_eol_type;
6018 if (NILP (coding_system))
6019 return Qraw_text;
6020 spec = CODING_SYSTEM_SPEC (coding_system);
6021 attrs = AREF (spec, 0);
6023 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6024 return coding_system;
6026 eol_type = AREF (spec, 2);
6027 if (VECTORP (eol_type))
6028 return Qraw_text;
6029 spec = CODING_SYSTEM_SPEC (Qraw_text);
6030 raw_text_eol_type = AREF (spec, 2);
6031 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6032 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6033 : AREF (raw_text_eol_type, 2));
6037 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6038 the subsidiary that has the same eol-spec as PARENT (if it is not
6039 nil and specifies end-of-line format) or the system's setting
6040 (system_eol_type). */
6042 Lisp_Object
6043 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6045 Lisp_Object spec, eol_type;
6047 if (NILP (coding_system))
6048 coding_system = Qraw_text;
6049 spec = CODING_SYSTEM_SPEC (coding_system);
6050 eol_type = AREF (spec, 2);
6051 if (VECTORP (eol_type))
6053 Lisp_Object parent_eol_type;
6055 if (! NILP (parent))
6057 Lisp_Object parent_spec;
6059 parent_spec = CODING_SYSTEM_SPEC (parent);
6060 parent_eol_type = AREF (parent_spec, 2);
6061 if (VECTORP (parent_eol_type))
6062 parent_eol_type = system_eol_type;
6064 else
6065 parent_eol_type = system_eol_type;
6066 if (EQ (parent_eol_type, Qunix))
6067 coding_system = AREF (eol_type, 0);
6068 else if (EQ (parent_eol_type, Qdos))
6069 coding_system = AREF (eol_type, 1);
6070 else if (EQ (parent_eol_type, Qmac))
6071 coding_system = AREF (eol_type, 2);
6073 return coding_system;
6077 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6078 decided for writing to a process. If not, complement them, and
6079 return a new coding system. */
6081 Lisp_Object
6082 complement_process_encoding_system (Lisp_Object coding_system)
6084 Lisp_Object coding_base = Qnil, eol_base = Qnil;
6085 Lisp_Object spec, attrs;
6086 int i;
6088 for (i = 0; i < 3; i++)
6090 if (i == 1)
6091 coding_system = CDR_SAFE (Vdefault_process_coding_system);
6092 else if (i == 2)
6093 coding_system = preferred_coding_system ();
6094 spec = CODING_SYSTEM_SPEC (coding_system);
6095 if (NILP (spec))
6096 continue;
6097 attrs = AREF (spec, 0);
6098 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6099 coding_base = CODING_ATTR_BASE_NAME (attrs);
6100 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6101 eol_base = coding_system;
6102 if (! NILP (coding_base) && ! NILP (eol_base))
6103 break;
6106 if (i > 0)
6107 /* The original CODING_SYSTEM didn't specify text-conversion or
6108 eol-conversion. Be sure that we return a fully complemented
6109 coding system. */
6110 coding_system = coding_inherit_eol_type (coding_base, eol_base);
6111 return coding_system;
6115 /* Emacs has a mechanism to automatically detect a coding system if it
6116 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6117 it's impossible to distinguish some coding systems accurately
6118 because they use the same range of codes. So, at first, coding
6119 systems are grouped into the categories listed below:
6121 o coding-category-emacs-mule
6123 The category for a coding system which has the same code range
6124 as Emacs' internal format. Assigned the coding-system (Lisp
6125 symbol) `emacs-mule' by default.
6127 o coding-category-sjis
6129 The category for a coding system which has the same code range
6130 as SJIS. Assigned the coding-system (Lisp
6131 symbol) `japanese-shift-jis' by default.
6133 o coding-category-iso-7
6135 The category for a coding system which has the same code range
6136 as ISO2022 of 7-bit environment. This doesn't use any locking
6137 shift and single shift functions. This can encode/decode all
6138 charsets. Assigned the coding-system (Lisp symbol)
6139 `iso-2022-7bit' by default.
6141 o coding-category-iso-7-tight
6143 Same as coding-category-iso-7 except that this can
6144 encode/decode only the specified charsets.
6146 o coding-category-iso-8-1
6148 The category for a coding system which has the same code range
6149 as ISO2022 of 8-bit environment and graphic plane 1 used only
6150 for DIMENSION1 charset. This doesn't use any locking shift
6151 and single shift functions. Assigned the coding-system (Lisp
6152 symbol) `iso-latin-1' by default.
6154 o coding-category-iso-8-2
6156 The category for a coding system which has the same code range
6157 as ISO2022 of 8-bit environment and graphic plane 1 used only
6158 for DIMENSION2 charset. This doesn't use any locking shift
6159 and single shift functions. Assigned the coding-system (Lisp
6160 symbol) `japanese-iso-8bit' by default.
6162 o coding-category-iso-7-else
6164 The category for a coding system which has the same code range
6165 as ISO2022 of 7-bit environment but uses locking shift or
6166 single shift functions. Assigned the coding-system (Lisp
6167 symbol) `iso-2022-7bit-lock' by default.
6169 o coding-category-iso-8-else
6171 The category for a coding system which has the same code range
6172 as ISO2022 of 8-bit environment but uses locking shift or
6173 single shift functions. Assigned the coding-system (Lisp
6174 symbol) `iso-2022-8bit-ss2' by default.
6176 o coding-category-big5
6178 The category for a coding system which has the same code range
6179 as BIG5. Assigned the coding-system (Lisp symbol)
6180 `cn-big5' by default.
6182 o coding-category-utf-8
6184 The category for a coding system which has the same code range
6185 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6186 symbol) `utf-8' by default.
6188 o coding-category-utf-16-be
6190 The category for a coding system in which a text has an
6191 Unicode signature (cf. Unicode Standard) in the order of BIG
6192 endian at the head. Assigned the coding-system (Lisp symbol)
6193 `utf-16-be' by default.
6195 o coding-category-utf-16-le
6197 The category for a coding system in which a text has an
6198 Unicode signature (cf. Unicode Standard) in the order of
6199 LITTLE endian at the head. Assigned the coding-system (Lisp
6200 symbol) `utf-16-le' by default.
6202 o coding-category-ccl
6204 The category for a coding system of which encoder/decoder is
6205 written in CCL programs. The default value is nil, i.e., no
6206 coding system is assigned.
6208 o coding-category-binary
6210 The category for a coding system not categorized in any of the
6211 above. Assigned the coding-system (Lisp symbol)
6212 `no-conversion' by default.
6214 Each of them is a Lisp symbol and the value is an actual
6215 `coding-system's (this is also a Lisp symbol) assigned by a user.
6216 What Emacs does actually is to detect a category of coding system.
6217 Then, it uses a `coding-system' assigned to it. If Emacs can't
6218 decide only one possible category, it selects a category of the
6219 highest priority. Priorities of categories are also specified by a
6220 user in a Lisp variable `coding-category-list'.
/* Forward declaration; the definition appears further down in this
   file.  */
6224 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6225 int eol_seen);
6228 /* Return the number of ASCII characters at the head of the source.
6229 By side effects, set coding->head_ascii and update
6230 coding->eol_seen. The value of coding->eol_seen is "logical or" of
6231 EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6232 reliable only when all the source bytes are ASCII. */
6234 static ptrdiff_t
6235 check_ascii (struct coding_system *coding)
6237 const unsigned char *src, *end;
6238 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6239 int eol_seen = coding->eol_seen;
6241 coding_set_source (coding);
6242 src = coding->source;
6243 end = src + coding->src_bytes;
6245 if (inhibit_eol_conversion
6246 || SYMBOLP (eol_type))
6248 /* We don't have to check EOL format. */
6249 while (src < end && !( *src & 0x80))
6251 if (*src++ == '\n')
6252 eol_seen |= EOL_SEEN_LF;
6255 else
6257 end--; /* We look ahead one byte for "CR LF". */
6258 while (src < end)
6260 int c = *src;
6262 if (c & 0x80)
6263 break;
6264 src++;
6265 if (c == '\r')
6267 if (*src == '\n')
6269 eol_seen |= EOL_SEEN_CRLF;
6270 src++;
6272 else
6273 eol_seen |= EOL_SEEN_CR;
6275 else if (c == '\n')
6276 eol_seen |= EOL_SEEN_LF;
6278 if (src == end)
6280 int c = *src;
6282 /* All bytes but the last one C are ASCII. */
6283 if (! (c & 0x80))
6285 if (c == '\r')
6286 eol_seen |= EOL_SEEN_CR;
6287 else if (c == '\n')
6288 eol_seen |= EOL_SEEN_LF;
6289 src++;
6293 coding->head_ascii = src - coding->source;
6294 coding->eol_seen = eol_seen;
6295 return (coding->head_ascii);
6299 /* Return the number of characters at the source if all the bytes are
6300 valid UTF-8 (of Unicode range). Otherwise, return -1. By side
6301 effects, update coding->eol_seen. The value of coding->eol_seen is
6302 "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6303 the value is reliable only when all the source bytes are valid
6304 UTF-8. */
6306 static ptrdiff_t
6307 check_utf_8 (struct coding_system *coding)
6309 const unsigned char *src, *end;
6310 int eol_seen;
6311 ptrdiff_t nchars = coding->head_ascii;
6313 if (coding->head_ascii < 0)
6314 check_ascii (coding);
6315 else
6316 coding_set_source (coding);
6317 src = coding->source + coding->head_ascii;
6318 /* We look ahead one byte for CR LF. */
6319 end = coding->source + coding->src_bytes - 1;
6320 eol_seen = coding->eol_seen;
6321 while (src < end)
6323 int c = *src;
6325 if (UTF_8_1_OCTET_P (*src))
6327 src++;
6328 if (c < 0x20)
6330 if (c == '\r')
6332 if (*src == '\n')
6334 eol_seen |= EOL_SEEN_CRLF;
6335 src++;
6336 nchars++;
6338 else
6339 eol_seen |= EOL_SEEN_CR;
6341 else if (c == '\n')
6342 eol_seen |= EOL_SEEN_LF;
6345 else if (UTF_8_2_OCTET_LEADING_P (c))
6347 if (c < 0xC2 /* overlong sequence */
6348 || src + 1 >= end
6349 || ! UTF_8_EXTRA_OCTET_P (src[1]))
6350 return -1;
6351 src += 2;
6353 else if (UTF_8_3_OCTET_LEADING_P (c))
6355 if (src + 2 >= end
6356 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6357 && UTF_8_EXTRA_OCTET_P (src[2])))
6358 return -1;
6359 c = (((c & 0xF) << 12)
6360 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6361 if (c < 0x800 /* overlong sequence */
6362 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6363 return -1;
6364 src += 3;
6366 else if (UTF_8_4_OCTET_LEADING_P (c))
6368 if (src + 3 >= end
6369 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6370 && UTF_8_EXTRA_OCTET_P (src[2])
6371 && UTF_8_EXTRA_OCTET_P (src[3])))
6372 return -1;
6373 c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6374 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6375 if (c < 0x10000 /* overlong sequence */
6376 || c >= 0x110000) /* non-Unicode character */
6377 return -1;
6378 src += 4;
6380 else
6381 return -1;
6382 nchars++;
6385 if (src == end)
6387 if (! UTF_8_1_OCTET_P (*src))
6388 return -1;
6389 nchars++;
6390 if (*src == '\r')
6391 eol_seen |= EOL_SEEN_CR;
6392 else if (*src == '\n')
6393 eol_seen |= EOL_SEEN_LF;
6395 coding->eol_seen = eol_seen;
6396 return nchars;
6400 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6401 SOURCE is encoded. If CATEGORY is one of
6402 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6403 two-byte, else they are encoded by one-byte.
6405 Return one of EOL_SEEN_XXX. */
6407 #define MAX_EOL_CHECK_COUNT 3
6409 static int
6410 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6411 enum coding_category category)
6413 const unsigned char *src = source, *src_end = src + src_bytes;
6414 unsigned char c;
6415 int total = 0;
6416 int eol_seen = EOL_SEEN_NONE;
6418 if ((1 << category) & CATEGORY_MASK_UTF_16)
6420 bool msb = category == (coding_category_utf_16_le
6421 | coding_category_utf_16_le_nosig);
6422 bool lsb = !msb;
6424 while (src + 1 < src_end)
6426 c = src[lsb];
6427 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6429 int this_eol;
6431 if (c == '\n')
6432 this_eol = EOL_SEEN_LF;
6433 else if (src + 3 >= src_end
6434 || src[msb + 2] != 0
6435 || src[lsb + 2] != '\n')
6436 this_eol = EOL_SEEN_CR;
6437 else
6439 this_eol = EOL_SEEN_CRLF;
6440 src += 2;
6443 if (eol_seen == EOL_SEEN_NONE)
6444 /* This is the first end-of-line. */
6445 eol_seen = this_eol;
6446 else if (eol_seen != this_eol)
6448 /* The found type is different from what found before.
6449 Allow for stray ^M characters in DOS EOL files. */
6450 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6451 || (eol_seen == EOL_SEEN_CRLF
6452 && this_eol == EOL_SEEN_CR))
6453 eol_seen = EOL_SEEN_CRLF;
6454 else
6456 eol_seen = EOL_SEEN_LF;
6457 break;
6460 if (++total == MAX_EOL_CHECK_COUNT)
6461 break;
6463 src += 2;
6466 else
6467 while (src < src_end)
6469 c = *src++;
6470 if (c == '\n' || c == '\r')
6472 int this_eol;
6474 if (c == '\n')
6475 this_eol = EOL_SEEN_LF;
6476 else if (src >= src_end || *src != '\n')
6477 this_eol = EOL_SEEN_CR;
6478 else
6479 this_eol = EOL_SEEN_CRLF, src++;
6481 if (eol_seen == EOL_SEEN_NONE)
6482 /* This is the first end-of-line. */
6483 eol_seen = this_eol;
6484 else if (eol_seen != this_eol)
6486 /* The found type is different from what found before.
6487 Allow for stray ^M characters in DOS EOL files. */
6488 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6489 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6490 eol_seen = EOL_SEEN_CRLF;
6491 else
6493 eol_seen = EOL_SEEN_LF;
6494 break;
6497 if (++total == MAX_EOL_CHECK_COUNT)
6498 break;
6501 return eol_seen;
6505 static Lisp_Object
6506 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6508 Lisp_Object eol_type;
6510 eol_type = CODING_ID_EOL_TYPE (coding->id);
6511 if (! VECTORP (eol_type))
6512 /* Already adjusted. */
6513 return eol_type;
6514 if (eol_seen & EOL_SEEN_LF)
6516 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6517 eol_type = Qunix;
6519 else if (eol_seen & EOL_SEEN_CRLF)
6521 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6522 eol_type = Qdos;
6524 else if (eol_seen & EOL_SEEN_CR)
6526 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6527 eol_type = Qmac;
6529 return eol_type;
6532 /* Detect how a text specified in CODING is encoded. If a coding
6533 system is detected, update fields of CODING by the detected coding
6534 system. */
6536 static void
6537 detect_coding (struct coding_system *coding)
6539 const unsigned char *src, *src_end;
6540 unsigned int saved_mode = coding->mode;
6541 Lisp_Object found = Qnil;
6542 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6544 coding->consumed = coding->consumed_char = 0;
6545 coding->produced = coding->produced_char = 0;
6546 coding_set_source (coding);
6548 src_end = coding->source + coding->src_bytes;
6550 coding->eol_seen = EOL_SEEN_NONE;
6551 /* If we have not yet decided the text encoding type, detect it
6552 now. */
6553 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6555 int c, i;
6556 struct coding_detection_info detect_info;
6557 bool null_byte_found = 0, eight_bit_found = 0;
6558 bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6559 inhibit_null_byte_detection);
6560 bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6561 inhibit_iso_escape_detection);
6562 bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6564 coding->head_ascii = 0;
6565 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6566 for (src = coding->source; src < src_end; src++)
6568 c = *src;
6569 if (c & 0x80)
6571 eight_bit_found = 1;
6572 if (null_byte_found)
6573 break;
6575 else if (c < 0x20)
6577 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6578 && ! inhibit_ied
6579 && ! detect_info.checked)
6581 if (detect_coding_iso_2022 (coding, &detect_info))
6583 /* We have scanned the whole data. */
6584 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6586 /* We didn't find an 8-bit code. We may
6587 have found a null-byte, but it's very
6588 rare that a binary file conforms to
6589 ISO-2022. */
6590 src = src_end;
6591 coding->head_ascii = src - coding->source;
6593 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6594 break;
6597 else if (! c && !inhibit_nbd)
6599 null_byte_found = 1;
6600 if (eight_bit_found)
6601 break;
6603 else if (! disable_ascii_optimization
6604 && ! inhibit_eol_conversion)
6606 if (c == '\r')
6608 if (src < src_end && src[1] == '\n')
6610 coding->eol_seen |= EOL_SEEN_CRLF;
6611 src++;
6612 if (! eight_bit_found)
6613 coding->head_ascii++;
6615 else
6616 coding->eol_seen |= EOL_SEEN_CR;
6618 else if (c == '\n')
6620 coding->eol_seen |= EOL_SEEN_LF;
6624 if (! eight_bit_found)
6625 coding->head_ascii++;
6627 else if (! eight_bit_found)
6628 coding->head_ascii++;
6631 if (null_byte_found || eight_bit_found
6632 || coding->head_ascii < coding->src_bytes
6633 || detect_info.found)
6635 enum coding_category category;
6636 struct coding_system *this;
6638 if (coding->head_ascii == coding->src_bytes)
6639 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6640 for (i = 0; i < coding_category_raw_text; i++)
6642 category = coding_priorities[i];
6643 this = coding_categories + category;
6644 if (detect_info.found & (1 << category))
6645 break;
6647 else
6649 if (null_byte_found)
6651 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6652 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6654 else if (prefer_utf_8
6655 && detect_coding_utf_8 (coding, &detect_info))
6657 detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6658 detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6660 for (i = 0; i < coding_category_raw_text; i++)
6662 category = coding_priorities[i];
6663 this = coding_categories + category;
6664 /* Some of this->detector (e.g. detect_coding_sjis)
6665 require this information. */
6666 coding->id = this->id;
6667 if (this->id < 0)
6669 /* No coding system of this category is defined. */
6670 detect_info.rejected |= (1 << category);
6672 else if (category >= coding_category_raw_text)
6673 continue;
6674 else if (detect_info.checked & (1 << category))
6676 if (detect_info.found & (1 << category))
6677 break;
6679 else if ((*(this->detector)) (coding, &detect_info)
6680 && detect_info.found & (1 << category))
6681 break;
6685 if (i < coding_category_raw_text)
6687 if (category == coding_category_utf_8_auto)
6689 Lisp_Object coding_systems;
6691 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6692 coding_attr_utf_bom);
6693 if (CONSP (coding_systems))
6695 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6696 found = XCAR (coding_systems);
6697 else
6698 found = XCDR (coding_systems);
6700 else
6701 found = CODING_ID_NAME (this->id);
6703 else if (category == coding_category_utf_16_auto)
6705 Lisp_Object coding_systems;
6707 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6708 coding_attr_utf_bom);
6709 if (CONSP (coding_systems))
6711 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6712 found = XCAR (coding_systems);
6713 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6714 found = XCDR (coding_systems);
6716 else
6717 found = CODING_ID_NAME (this->id);
6719 else
6720 found = CODING_ID_NAME (this->id);
6722 else if (null_byte_found)
6723 found = Qno_conversion;
6724 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6725 == CATEGORY_MASK_ANY)
6726 found = Qraw_text;
6727 else if (detect_info.rejected)
6728 for (i = 0; i < coding_category_raw_text; i++)
6729 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6731 this = coding_categories + coding_priorities[i];
6732 found = CODING_ID_NAME (this->id);
6733 break;
6737 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6738 == coding_category_utf_8_auto)
6740 Lisp_Object coding_systems;
6741 struct coding_detection_info detect_info;
6743 coding_systems
6744 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6745 detect_info.found = detect_info.rejected = 0;
6746 if (check_ascii (coding) == coding->src_bytes)
6748 if (CONSP (coding_systems))
6749 found = XCDR (coding_systems);
6751 else
6753 if (CONSP (coding_systems)
6754 && detect_coding_utf_8 (coding, &detect_info))
6756 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6757 found = XCAR (coding_systems);
6758 else
6759 found = XCDR (coding_systems);
6763 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6764 == coding_category_utf_16_auto)
6766 Lisp_Object coding_systems;
6767 struct coding_detection_info detect_info;
6769 coding_systems
6770 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6771 detect_info.found = detect_info.rejected = 0;
6772 coding->head_ascii = 0;
6773 if (CONSP (coding_systems)
6774 && detect_coding_utf_16 (coding, &detect_info))
6776 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6777 found = XCAR (coding_systems);
6778 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6779 found = XCDR (coding_systems);
6783 if (! NILP (found))
6785 int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6786 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6787 : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6788 : EOL_SEEN_LF);
6790 setup_coding_system (found, coding);
6791 if (specified_eol != EOL_SEEN_NONE)
6792 adjust_coding_eol_type (coding, specified_eol);
6795 coding->mode = saved_mode;
6799 static void
6800 decode_eol (struct coding_system *coding)
6802 Lisp_Object eol_type;
6803 unsigned char *p, *pbeg, *pend;
6805 eol_type = CODING_ID_EOL_TYPE (coding->id);
6806 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6807 return;
6809 if (NILP (coding->dst_object))
6810 pbeg = coding->destination;
6811 else
6812 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6813 pend = pbeg + coding->produced;
6815 if (VECTORP (eol_type))
6817 int eol_seen = EOL_SEEN_NONE;
6819 for (p = pbeg; p < pend; p++)
6821 if (*p == '\n')
6822 eol_seen |= EOL_SEEN_LF;
6823 else if (*p == '\r')
6825 if (p + 1 < pend && *(p + 1) == '\n')
6827 eol_seen |= EOL_SEEN_CRLF;
6828 p++;
6830 else
6831 eol_seen |= EOL_SEEN_CR;
6834 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6835 if ((eol_seen & EOL_SEEN_CRLF) != 0
6836 && (eol_seen & EOL_SEEN_CR) != 0
6837 && (eol_seen & EOL_SEEN_LF) == 0)
6838 eol_seen = EOL_SEEN_CRLF;
6839 else if (eol_seen != EOL_SEEN_NONE
6840 && eol_seen != EOL_SEEN_LF
6841 && eol_seen != EOL_SEEN_CRLF
6842 && eol_seen != EOL_SEEN_CR)
6843 eol_seen = EOL_SEEN_LF;
6844 if (eol_seen != EOL_SEEN_NONE)
6845 eol_type = adjust_coding_eol_type (coding, eol_seen);
6848 if (EQ (eol_type, Qmac))
6850 for (p = pbeg; p < pend; p++)
6851 if (*p == '\r')
6852 *p = '\n';
6854 else if (EQ (eol_type, Qdos))
6856 ptrdiff_t n = 0;
6858 if (NILP (coding->dst_object))
6860 /* Start deleting '\r' from the tail to minimize the memory
6861 movement. */
6862 for (p = pend - 2; p >= pbeg; p--)
6863 if (*p == '\r')
6865 memmove (p, p + 1, pend-- - p - 1);
6866 n++;
6869 else
6871 ptrdiff_t pos_byte = coding->dst_pos_byte;
6872 ptrdiff_t pos = coding->dst_pos;
6873 ptrdiff_t pos_end = pos + coding->produced_char - 1;
6875 while (pos < pos_end)
6877 p = BYTE_POS_ADDR (pos_byte);
6878 if (*p == '\r' && p[1] == '\n')
6880 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6881 n++;
6882 pos_end--;
6884 pos++;
6885 if (coding->dst_multibyte)
6886 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6887 else
6888 pos_byte++;
6891 coding->produced -= n;
6892 coding->produced_char -= n;
6897 /* Return a translation table (or list of them) from coding system
6898 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6899 not ENCODEP). */
6901 static Lisp_Object
6902 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6904 Lisp_Object standard, translation_table;
6905 Lisp_Object val;
6907 if (NILP (Venable_character_translation))
6909 if (max_lookup)
6910 *max_lookup = 0;
6911 return Qnil;
6913 if (encodep)
6914 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6915 standard = Vstandard_translation_table_for_encode;
6916 else
6917 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6918 standard = Vstandard_translation_table_for_decode;
6919 if (NILP (translation_table))
6920 translation_table = standard;
6921 else
6923 if (SYMBOLP (translation_table))
6924 translation_table = Fget (translation_table, Qtranslation_table);
6925 else if (CONSP (translation_table))
6927 translation_table = Fcopy_sequence (translation_table);
6928 for (val = translation_table; CONSP (val); val = XCDR (val))
6929 if (SYMBOLP (XCAR (val)))
6930 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6932 if (CHAR_TABLE_P (standard))
6934 if (CONSP (translation_table))
6935 translation_table = nconc2 (translation_table, list1 (standard));
6936 else
6937 translation_table = list2 (translation_table, standard);
6941 if (max_lookup)
6943 *max_lookup = 1;
6944 if (CHAR_TABLE_P (translation_table)
6945 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6947 val = XCHAR_TABLE (translation_table)->extras[1];
6948 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6949 *max_lookup = XFASTINT (val);
6951 else if (CONSP (translation_table))
6953 Lisp_Object tail;
6955 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6956 if (CHAR_TABLE_P (XCAR (tail))
6957 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6959 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6960 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6961 *max_lookup = XFASTINT (tailval);
6965 return translation_table;
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and store the result in TRANS.

   Whenever a table maps C to a character, C is replaced by that
   character and TRANS is reset to Qnil.  With a list of tables the
   lookup continues through the remaining tables using the new C, and
   stops at the first table that yields a non-nil, non-character
   value, which is left in TRANS.  C and TRANS must be lvalues; TABLE
   and C are evaluated more than once.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    trans = Qnil;                                                       \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        trans = CHAR_TABLE_REF (table, c);                              \
        if (CHARACTERP (trans))                                         \
          {                                                             \
            c = XFASTINT (trans);                                       \
            trans = Qnil;                                               \
          }                                                             \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object tail = table;                                       \
                                                                        \
        for (; CONSP (tail); tail = XCDR (tail))                        \
          {                                                             \
            if (! CHAR_TABLE_P (XCAR (tail)))                           \
              continue;                                                 \
            trans = CHAR_TABLE_REF (XCAR (tail), c);                    \
            if (CHARACTERP (trans))                                     \
              {                                                         \
                c = XFASTINT (trans);                                   \
                trans = Qnil;                                           \
              }                                                         \
            else if (! NILP (trans))                                    \
              break;                                                    \
          }                                                             \
      }                                                                 \
  } while (0)
6994 /* Return a translation of character(s) at BUF according to TRANS.
6995 TRANS is TO-CHAR or ((FROM . TO) ...) where
6996 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6997 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6998 translation is found, and Qnil if not found..
6999 If BUF is too short to lookup characters in FROM, return Qt. */
7001 static Lisp_Object
7002 get_translation (Lisp_Object trans, int *buf, int *buf_end)
7005 if (INTEGERP (trans))
7006 return trans;
7007 for (; CONSP (trans); trans = XCDR (trans))
7009 Lisp_Object val = XCAR (trans);
7010 Lisp_Object from = XCAR (val);
7011 ptrdiff_t len = ASIZE (from);
7012 ptrdiff_t i;
7014 for (i = 0; i < len; i++)
7016 if (buf + i == buf_end)
7017 return Qt;
7018 if (XINT (AREF (from, i)) != buf[i])
7019 break;
7021 if (i == len)
7022 return val;
7024 return Qnil;
7028 static int
7029 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7030 bool last_block)
7032 unsigned char *dst = coding->destination + coding->produced;
7033 unsigned char *dst_end = coding->destination + coding->dst_bytes;
7034 ptrdiff_t produced;
7035 ptrdiff_t produced_chars = 0;
7036 int carryover = 0;
7038 if (! coding->chars_at_source)
7040 /* Source characters are in coding->charbuf. */
7041 int *buf = coding->charbuf;
7042 int *buf_end = buf + coding->charbuf_used;
7044 if (EQ (coding->src_object, coding->dst_object))
7046 coding_set_source (coding);
7047 dst_end = ((unsigned char *) coding->source) + coding->consumed;
7050 while (buf < buf_end)
7052 int c = *buf;
7053 ptrdiff_t i;
7055 if (c >= 0)
7057 ptrdiff_t from_nchars = 1, to_nchars = 1;
7058 Lisp_Object trans = Qnil;
7060 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7061 if (! NILP (trans))
7063 trans = get_translation (trans, buf, buf_end);
7064 if (INTEGERP (trans))
7065 c = XINT (trans);
7066 else if (CONSP (trans))
7068 from_nchars = ASIZE (XCAR (trans));
7069 trans = XCDR (trans);
7070 if (INTEGERP (trans))
7071 c = XINT (trans);
7072 else
7074 to_nchars = ASIZE (trans);
7075 c = XINT (AREF (trans, 0));
7078 else if (EQ (trans, Qt) && ! last_block)
7079 break;
7082 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7084 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7085 / MAX_MULTIBYTE_LENGTH)
7086 < to_nchars)
7087 memory_full (SIZE_MAX);
7088 dst = alloc_destination (coding,
7089 buf_end - buf
7090 + MAX_MULTIBYTE_LENGTH * to_nchars,
7091 dst);
7092 if (EQ (coding->src_object, coding->dst_object))
7094 coding_set_source (coding);
7095 dst_end = (((unsigned char *) coding->source)
7096 + coding->consumed);
7098 else
7099 dst_end = coding->destination + coding->dst_bytes;
7102 for (i = 0; i < to_nchars; i++)
7104 if (i > 0)
7105 c = XINT (AREF (trans, i));
7106 if (coding->dst_multibyte
7107 || ! CHAR_BYTE8_P (c))
7108 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7109 else
7110 *dst++ = CHAR_TO_BYTE8 (c);
7112 produced_chars += to_nchars;
7113 buf += from_nchars;
7115 else
7116 /* This is an annotation datum. (-C) is the length. */
7117 buf += -c;
7119 carryover = buf_end - buf;
7121 else
7123 /* Source characters are at coding->source. */
7124 const unsigned char *src = coding->source;
7125 const unsigned char *src_end = src + coding->consumed;
7127 if (EQ (coding->dst_object, coding->src_object))
7128 dst_end = (unsigned char *) src;
7129 if (coding->src_multibyte != coding->dst_multibyte)
7131 if (coding->src_multibyte)
7133 bool multibytep = 1;
7134 ptrdiff_t consumed_chars = 0;
7136 while (1)
7138 const unsigned char *src_base = src;
7139 int c;
7141 ONE_MORE_BYTE (c);
7142 if (dst == dst_end)
7144 if (EQ (coding->src_object, coding->dst_object))
7145 dst_end = (unsigned char *) src;
7146 if (dst == dst_end)
7148 ptrdiff_t offset = src - coding->source;
7150 dst = alloc_destination (coding, src_end - src + 1,
7151 dst);
7152 dst_end = coding->destination + coding->dst_bytes;
7153 coding_set_source (coding);
7154 src = coding->source + offset;
7155 src_end = coding->source + coding->consumed;
7156 if (EQ (coding->src_object, coding->dst_object))
7157 dst_end = (unsigned char *) src;
7160 *dst++ = c;
7161 produced_chars++;
7163 no_more_source:
7166 else
7167 while (src < src_end)
7169 bool multibytep = 1;
7170 int c = *src++;
7172 if (dst >= dst_end - 1)
7174 if (EQ (coding->src_object, coding->dst_object))
7175 dst_end = (unsigned char *) src;
7176 if (dst >= dst_end - 1)
7178 ptrdiff_t offset = src - coding->source;
7179 ptrdiff_t more_bytes;
7181 if (EQ (coding->src_object, coding->dst_object))
7182 more_bytes = ((src_end - src) / 2) + 2;
7183 else
7184 more_bytes = src_end - src + 2;
7185 dst = alloc_destination (coding, more_bytes, dst);
7186 dst_end = coding->destination + coding->dst_bytes;
7187 coding_set_source (coding);
7188 src = coding->source + offset;
7189 src_end = coding->source + coding->consumed;
7190 if (EQ (coding->src_object, coding->dst_object))
7191 dst_end = (unsigned char *) src;
7194 EMIT_ONE_BYTE (c);
7197 else
7199 if (!EQ (coding->src_object, coding->dst_object))
7201 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7203 if (require > 0)
7205 ptrdiff_t offset = src - coding->source;
7207 dst = alloc_destination (coding, require, dst);
7208 coding_set_source (coding);
7209 src = coding->source + offset;
7210 src_end = coding->source + coding->consumed;
7213 produced_chars = coding->consumed_char;
7214 while (src < src_end)
7215 *dst++ = *src++;
7219 produced = dst - (coding->destination + coding->produced);
7220 if (BUFFERP (coding->dst_object) && produced_chars > 0)
7221 insert_from_gap (produced_chars, produced, 0);
7222 coding->produced += produced;
7223 coding->produced_char += produced_chars;
7224 return carryover;
7227 /* Compose text in CODING->object according to the annotation data at
7228 CHARBUF. CHARBUF is an array:
7229 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7232 static void
7233 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7235 int len;
7236 ptrdiff_t to;
7237 enum composition_method method;
7238 Lisp_Object components;
7240 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7241 to = pos + charbuf[2];
7242 method = (enum composition_method) (charbuf[4]);
7244 if (method == COMPOSITION_RELATIVE)
7245 components = Qnil;
7246 else
7248 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7249 int i, j;
7251 if (method == COMPOSITION_WITH_RULE)
7252 len = charbuf[2] * 3 - 2;
7253 charbuf += MAX_ANNOTATION_LENGTH;
7254 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7255 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7257 if (charbuf[i] >= 0)
7258 args[j] = make_number (charbuf[i]);
7259 else
7261 i++;
7262 args[j] = make_number (charbuf[i] % 0x100);
7265 components = (i == j ? Fstring (j, args) : Fvector (j, args));
7267 compose_text (pos, to, components, Qnil, coding->dst_object);
7271 /* Put `charset' property on text in CODING->object according to
7272 the annotation data at CHARBUF. CHARBUF is an array:
7273 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7276 static void
7277 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7279 ptrdiff_t from = pos - charbuf[2];
7280 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7282 Fput_text_property (make_number (from), make_number (pos),
7283 Qcharset, CHARSET_NAME (charset),
7284 coding->dst_object);
/* Number of `int' elements in the conversion work area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf of CHARBUF_SIZE elements
   with SAFE_ALLOCA and record its size in CODING->charbuf_size.
   CODING is a pointer expression; it is parenthesized in the
   expansion so that arguments such as `p + 1' expand correctly.
   Because SAFE_ALLOCA is used, the expansion must appear in a
   function that has executed USE_SAFE_ALLOCA and later calls
   SAFE_FREE.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    (coding)->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));      \
    (coding)->charbuf_size = CHARBUF_SIZE;                              \
  } while (0)
7297 static void
7298 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7300 int *charbuf = coding->charbuf;
7301 int *charbuf_end = charbuf + coding->charbuf_used;
7303 if (NILP (coding->dst_object))
7304 return;
7306 while (charbuf < charbuf_end)
7308 if (*charbuf >= 0)
7309 pos++, charbuf++;
7310 else
7312 int len = -*charbuf;
7314 if (len > 2)
7315 switch (charbuf[1])
7317 case CODING_ANNOTATE_COMPOSITION_MASK:
7318 produce_composition (coding, charbuf, pos);
7319 break;
7320 case CODING_ANNOTATE_CHARSET_MASK:
7321 produce_charset (coding, charbuf, pos);
7322 break;
7324 charbuf += len;
7329 /* Decode the data at CODING->src_object into CODING->dst_object.
7330 CODING->src_object is a buffer, a string, or nil.
7331 CODING->dst_object is a buffer.
7333 If CODING->src_object is a buffer, it must be the current buffer.
7334 In this case, if CODING->src_pos is positive, it is a position of
7335 the source text in the buffer, otherwise, the source text is in the
7336 gap area of the buffer, and CODING->src_pos specifies the offset of
7337 the text from GPT (which must be the same as PT). If this is the
7338 same buffer as CODING->dst_object, CODING->src_pos must be
7339 negative.
7341 If CODING->src_object is a string, CODING->src_pos is an index to
7342 that string.
7344 If CODING->src_object is nil, CODING->source must already point to
7345 the non-relocatable memory area. In this case, CODING->src_pos is
7346 an offset from CODING->source.
7348 The decoded data is inserted at the current point of the buffer
7349 CODING->dst_object.
7352 static void
7353 decode_coding (struct coding_system *coding)
7355 Lisp_Object attrs;
7356 Lisp_Object undo_list;
7357 Lisp_Object translation_table;
7358 struct ccl_spec cclspec;
7359 int carryover;
7360 int i;
7362 USE_SAFE_ALLOCA;
7364 if (BUFFERP (coding->src_object)
7365 && coding->src_pos > 0
7366 && coding->src_pos < GPT
7367 && coding->src_pos + coding->src_chars > GPT)
7368 move_gap_both (coding->src_pos, coding->src_pos_byte);
7370 undo_list = Qt;
7371 if (BUFFERP (coding->dst_object))
7373 set_buffer_internal (XBUFFER (coding->dst_object));
7374 if (GPT != PT)
7375 move_gap_both (PT, PT_BYTE);
7377 /* We must disable undo_list in order to record the whole insert
7378 transaction via record_insert at the end. But doing so also
7379 disables the recording of the first change to the undo_list.
7380 Therefore we check for first change here and record it via
7381 record_first_change if needed. */
7382 if (MODIFF <= SAVE_MODIFF)
7383 record_first_change ();
7385 undo_list = BVAR (current_buffer, undo_list);
7386 bset_undo_list (current_buffer, Qt);
7389 coding->consumed = coding->consumed_char = 0;
7390 coding->produced = coding->produced_char = 0;
7391 coding->chars_at_source = 0;
7392 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7393 coding->errors = 0;
7395 ALLOC_CONVERSION_WORK_AREA (coding);
7397 attrs = CODING_ID_ATTRS (coding->id);
7398 translation_table = get_translation_table (attrs, 0, NULL);
7400 carryover = 0;
7401 if (coding->decoder == decode_coding_ccl)
7403 coding->spec.ccl = &cclspec;
7404 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7408 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7410 coding_set_source (coding);
7411 coding->annotated = 0;
7412 coding->charbuf_used = carryover;
7413 (*(coding->decoder)) (coding);
7414 coding_set_destination (coding);
7415 carryover = produce_chars (coding, translation_table, 0);
7416 if (coding->annotated)
7417 produce_annotation (coding, pos);
7418 for (i = 0; i < carryover; i++)
7419 coding->charbuf[i]
7420 = coding->charbuf[coding->charbuf_used - carryover + i];
7422 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7423 || (coding->consumed < coding->src_bytes
7424 && (coding->result == CODING_RESULT_SUCCESS
7425 || coding->result == CODING_RESULT_INVALID_SRC)));
7427 if (carryover > 0)
7429 coding_set_destination (coding);
7430 coding->charbuf_used = carryover;
7431 produce_chars (coding, translation_table, 1);
7434 coding->carryover_bytes = 0;
7435 if (coding->consumed < coding->src_bytes)
7437 ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7438 const unsigned char *src;
7440 coding_set_source (coding);
7441 coding_set_destination (coding);
7442 src = coding->source + coding->consumed;
7444 if (coding->mode & CODING_MODE_LAST_BLOCK)
7446 /* Flush out unprocessed data as binary chars. We are sure
7447 that the number of data is less than the size of
7448 coding->charbuf. */
7449 coding->charbuf_used = 0;
7450 coding->chars_at_source = 0;
7452 while (nbytes-- > 0)
7454 int c = *src++;
7456 if (c & 0x80)
7457 c = BYTE8_TO_CHAR (c);
7458 coding->charbuf[coding->charbuf_used++] = c;
7460 produce_chars (coding, Qnil, 1);
7462 else
7464 /* Record unprocessed bytes in coding->carryover. We are
7465 sure that the number of data is less than the size of
7466 coding->carryover. */
7467 unsigned char *p = coding->carryover;
7469 if (nbytes > sizeof coding->carryover)
7470 nbytes = sizeof coding->carryover;
7471 coding->carryover_bytes = nbytes;
7472 while (nbytes-- > 0)
7473 *p++ = *src++;
7475 coding->consumed = coding->src_bytes;
7478 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7479 && !inhibit_eol_conversion)
7480 decode_eol (coding);
7481 if (BUFFERP (coding->dst_object))
7483 bset_undo_list (current_buffer, undo_list);
7484 record_insert (coding->dst_pos, coding->produced_char);
7487 SAFE_FREE ();
7491 /* Extract an annotation datum from a composition starting at POS and
7492 ending before LIMIT of CODING->src_object (buffer or string), store
7493 the data in BUF, set *STOP to a starting position of the next
7494 composition (if any) or to LIMIT, and return the address of the
7495 next element of BUF.
7497 If such an annotation is not found, set *STOP to a starting
7498 position of a composition after POS (if any) or to LIMIT, and
7499 return BUF. */
7501 static int *
7502 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7503 struct coding_system *coding, int *buf,
7504 ptrdiff_t *stop)
7506 ptrdiff_t start, end;
7507 Lisp_Object prop;
7509 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7510 || end > limit)
7511 *stop = limit;
7512 else if (start > pos)
7513 *stop = start;
7514 else
7516 if (start == pos)
7518 /* We found a composition. Store the corresponding
7519 annotation data in BUF. */
7520 int *head = buf;
7521 enum composition_method method = composition_method (prop);
7522 int nchars = COMPOSITION_LENGTH (prop);
7524 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7525 if (method != COMPOSITION_RELATIVE)
7527 Lisp_Object components;
7528 ptrdiff_t i, len, i_byte;
7530 components = COMPOSITION_COMPONENTS (prop);
7531 if (VECTORP (components))
7533 len = ASIZE (components);
7534 for (i = 0; i < len; i++)
7535 *buf++ = XINT (AREF (components, i));
7537 else if (STRINGP (components))
7539 len = SCHARS (components);
7540 i = i_byte = 0;
7541 while (i < len)
7543 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7544 buf++;
7547 else if (INTEGERP (components))
7549 len = 1;
7550 *buf++ = XINT (components);
7552 else if (CONSP (components))
7554 for (len = 0; CONSP (components);
7555 len++, components = XCDR (components))
7556 *buf++ = XINT (XCAR (components));
7558 else
7559 emacs_abort ();
7560 *head -= len;
7564 if (find_composition (end, limit, &start, &end, &prop,
7565 coding->src_object)
7566 && end <= limit)
7567 *stop = start;
7568 else
7569 *stop = limit;
7571 return buf;
7575 /* Extract an annotation datum from a text property `charset' at POS of
7576 CODING->src_object (buffer of string), store the data in BUF, set
7577 *STOP to the position where the value of `charset' property changes
7578 (limiting by LIMIT), and return the address of the next element of
7579 BUF.
7581 If the property value is nil, set *STOP to the position where the
7582 property value is non-nil (limiting by LIMIT), and return BUF. */
7584 static int *
7585 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7586 struct coding_system *coding, int *buf,
7587 ptrdiff_t *stop)
7589 Lisp_Object val, next;
7590 int id;
7592 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7593 if (! NILP (val) && CHARSETP (val))
7594 id = XINT (CHARSET_SYMBOL_ID (val));
7595 else
7596 id = -1;
7597 ADD_CHARSET_DATA (buf, 0, id);
7598 next = Fnext_single_property_change (make_number (pos), Qcharset,
7599 coding->src_object,
7600 make_number (limit));
7601 *stop = XINT (next);
7602 return buf;
7606 static void
7607 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7608 int max_lookup)
7610 int *buf = coding->charbuf;
7611 int *buf_end = coding->charbuf + coding->charbuf_size;
7612 const unsigned char *src = coding->source + coding->consumed;
7613 const unsigned char *src_end = coding->source + coding->src_bytes;
7614 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7615 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7616 bool multibytep = coding->src_multibyte;
7617 Lisp_Object eol_type;
7618 int c;
7619 ptrdiff_t stop, stop_composition, stop_charset;
7620 int *lookup_buf = NULL;
7622 if (! NILP (translation_table))
7623 lookup_buf = alloca (sizeof (int) * max_lookup);
7625 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7626 if (VECTORP (eol_type))
7627 eol_type = Qunix;
7629 /* Note: composition handling is not yet implemented. */
7630 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7632 if (NILP (coding->src_object))
7633 stop = stop_composition = stop_charset = end_pos;
7634 else
7636 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7637 stop = stop_composition = pos;
7638 else
7639 stop = stop_composition = end_pos;
7640 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7641 stop = stop_charset = pos;
7642 else
7643 stop_charset = end_pos;
7646 /* Compensate for CRLF and conversion. */
7647 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7648 while (buf < buf_end)
7650 Lisp_Object trans;
7652 if (pos == stop)
7654 if (pos == end_pos)
7655 break;
7656 if (pos == stop_composition)
7657 buf = handle_composition_annotation (pos, end_pos, coding,
7658 buf, &stop_composition);
7659 if (pos == stop_charset)
7660 buf = handle_charset_annotation (pos, end_pos, coding,
7661 buf, &stop_charset);
7662 stop = (stop_composition < stop_charset
7663 ? stop_composition : stop_charset);
7666 if (! multibytep)
7668 int bytes;
7670 if (coding->encoder == encode_coding_raw_text
7671 || coding->encoder == encode_coding_ccl)
7672 c = *src++, pos++;
7673 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7674 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7675 else
7676 c = BYTE8_TO_CHAR (*src), src++, pos++;
7678 else
7679 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7680 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7681 c = '\n';
7682 if (! EQ (eol_type, Qunix))
7684 if (c == '\n')
7686 if (EQ (eol_type, Qdos))
7687 *buf++ = '\r';
7688 else
7689 c = '\r';
7693 trans = Qnil;
7694 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7695 if (NILP (trans))
7696 *buf++ = c;
7697 else
7699 ptrdiff_t from_nchars = 1, to_nchars = 1;
7700 int *lookup_buf_end;
7701 const unsigned char *p = src;
7702 int i;
7704 lookup_buf[0] = c;
7705 for (i = 1; i < max_lookup && p < src_end; i++)
7706 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7707 lookup_buf_end = lookup_buf + i;
7708 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7709 if (INTEGERP (trans))
7710 c = XINT (trans);
7711 else if (CONSP (trans))
7713 from_nchars = ASIZE (XCAR (trans));
7714 trans = XCDR (trans);
7715 if (INTEGERP (trans))
7716 c = XINT (trans);
7717 else
7719 to_nchars = ASIZE (trans);
7720 if (buf_end - buf < to_nchars)
7721 break;
7722 c = XINT (AREF (trans, 0));
7725 else
7726 break;
7727 *buf++ = c;
7728 for (i = 1; i < to_nchars; i++)
7729 *buf++ = XINT (AREF (trans, i));
7730 for (i = 1; i < from_nchars; i++, pos++)
7731 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7735 coding->consumed = src - coding->source;
7736 coding->consumed_char = pos - coding->src_pos;
7737 coding->charbuf_used = buf - coding->charbuf;
7738 coding->chars_at_source = 0;
7742 /* Encode the text at CODING->src_object into CODING->dst_object.
7743 CODING->src_object is a buffer or a string.
7744 CODING->dst_object is a buffer or nil.
7746 If CODING->src_object is a buffer, it must be the current buffer.
7747 In this case, if CODING->src_pos is positive, it is a position of
7748 the source text in the buffer, otherwise. the source text is in the
7749 gap area of the buffer, and coding->src_pos specifies the offset of
7750 the text from GPT (which must be the same as PT). If this is the
7751 same buffer as CODING->dst_object, CODING->src_pos must be
7752 negative and CODING should not have `pre-write-conversion'.
7754 If CODING->src_object is a string, CODING should not have
7755 `pre-write-conversion'.
7757 If CODING->dst_object is a buffer, the encoded data is inserted at
7758 the current point of that buffer.
7760 If CODING->dst_object is nil, the encoded data is placed at the
7761 memory area specified by CODING->destination. */
7763 static void
7764 encode_coding (struct coding_system *coding)
7766 Lisp_Object attrs;
7767 Lisp_Object translation_table;
7768 int max_lookup;
7769 struct ccl_spec cclspec;
7771 USE_SAFE_ALLOCA;
7773 attrs = CODING_ID_ATTRS (coding->id);
7774 if (coding->encoder == encode_coding_raw_text)
7775 translation_table = Qnil, max_lookup = 0;
7776 else
7777 translation_table = get_translation_table (attrs, 1, &max_lookup);
7779 if (BUFFERP (coding->dst_object))
7781 set_buffer_internal (XBUFFER (coding->dst_object));
7782 coding->dst_multibyte
7783 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7786 coding->consumed = coding->consumed_char = 0;
7787 coding->produced = coding->produced_char = 0;
7788 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7789 coding->errors = 0;
7791 ALLOC_CONVERSION_WORK_AREA (coding);
7793 if (coding->encoder == encode_coding_ccl)
7795 coding->spec.ccl = &cclspec;
7796 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7798 do {
7799 coding_set_source (coding);
7800 consume_chars (coding, translation_table, max_lookup);
7801 coding_set_destination (coding);
7802 (*(coding->encoder)) (coding);
7803 } while (coding->consumed_char < coding->src_chars);
7805 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7806 insert_from_gap (coding->produced_char, coding->produced, 0);
7808 SAFE_FREE ();
7812 /* Name (or base name) of work buffer for code conversion. */
7813 static Lisp_Object Vcode_conversion_workbuf_name;
7815 /* A working buffer used by the top level conversion. Once it is
7816 created, it is never destroyed. It has the name
7817 Vcode_conversion_workbuf_name. The other working buffers are
7818 destroyed after the use is finished, and their names are modified
7819 versions of Vcode_conversion_workbuf_name. */
7820 static Lisp_Object Vcode_conversion_reused_workbuf;
7822 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7823 static bool reused_workbuf_in_use;
7826 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7827 multibyteness of returning buffer. */
7829 static Lisp_Object
7830 make_conversion_work_buffer (bool multibyte)
7832 Lisp_Object name, workbuf;
7833 struct buffer *current;
7835 if (reused_workbuf_in_use)
7837 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7838 workbuf = Fget_buffer_create (name);
7840 else
7842 reused_workbuf_in_use = 1;
7843 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7844 Vcode_conversion_reused_workbuf
7845 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7846 workbuf = Vcode_conversion_reused_workbuf;
7848 current = current_buffer;
7849 set_buffer_internal (XBUFFER (workbuf));
7850 /* We can't allow modification hooks to run in the work buffer. For
7851 instance, directory_files_internal assumes that file decoding
7852 doesn't compile new regexps. */
7853 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7854 Ferase_buffer ();
7855 bset_undo_list (current_buffer, Qt);
7856 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7857 set_buffer_internal (current);
7858 return workbuf;
7862 static void
7863 code_conversion_restore (Lisp_Object arg)
7865 Lisp_Object current, workbuf;
7866 struct gcpro gcpro1;
7868 GCPRO1 (arg);
7869 current = XCAR (arg);
7870 workbuf = XCDR (arg);
7871 if (! NILP (workbuf))
7873 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7874 reused_workbuf_in_use = 0;
7875 else
7876 Fkill_buffer (workbuf);
7878 set_buffer_internal (XBUFFER (current));
7879 UNGCPRO;
7882 Lisp_Object
7883 code_conversion_save (bool with_work_buf, bool multibyte)
7885 Lisp_Object workbuf = Qnil;
7887 if (with_work_buf)
7888 workbuf = make_conversion_work_buffer (multibyte);
7889 record_unwind_protect (code_conversion_restore,
7890 Fcons (Fcurrent_buffer (), workbuf));
7891 return workbuf;
/* Decode CHARS characters (BYTES bytes) of source text with CODING,
   using the current buffer as both source and destination; the
   destination position is point.  On return CODING->produced and
   CODING->produced_char describe the decoded text.

   When the source is unibyte and turns out to be pure ASCII (or, for a
   utf-8 coding system, already validated UTF-8), only end-of-line
   conversion is done, in place, and the bytes are handed to
   insert_from_gap without running the decoder.  Otherwise
   decode_coding is run and the coding system's post-read function, if
   any, is called afterwards.  */
7894 void
7895 decode_coding_gap (struct coding_system *coding,
7896 ptrdiff_t chars, ptrdiff_t bytes)
7898 ptrdiff_t count = SPECPDL_INDEX ();
7899 Lisp_Object attrs;
7901 coding->src_object = Fcurrent_buffer ();
7902 coding->src_chars = chars;
7903 coding->src_bytes = bytes;
/* NOTE(review): negative source positions look like the convention for
   "the source text is not part of the buffer text proper"; confirm in
   coding_set_source.  */
7904 coding->src_pos = -chars;
7905 coding->src_pos_byte = -bytes;
7906 coding->src_multibyte = chars < bytes;
7907 coding->dst_object = coding->src_object;
7908 coding->dst_pos = PT;
7909 coding->dst_pos_byte = PT_BYTE;
7910 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
/* -1 / EOL_SEEN_NONE mean "not determined yet"; detect_coding may fill
   these in.  */
7912 coding->head_ascii = -1;
7913 coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7914 coding->eol_seen = EOL_SEEN_NONE;
7915 if (CODING_REQUIRE_DETECTION (coding))
7916 detect_coding (coding);
7917 attrs = CODING_ID_ATTRS (coding->id);
/* Fast path: only for unibyte source, an ASCII-compatible coding
   system, no post-read function and no decoding translation table.  */
7918 if (! disable_ascii_optimization
7919 && ! coding->src_multibyte
7920 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7921 && NILP (CODING_ATTR_POST_READ (attrs))
7922 && NILP (get_translation_table (attrs, 0, NULL)))
7924 chars = coding->head_ascii;
7925 if (chars < 0)
7926 chars = check_ascii (coding);
7927 if (chars != bytes)
7929 /* There exists a non-ASCII byte. */
7930 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7931 && coding->detected_utf8_bytes == coding->src_bytes)
7933 if (coding->detected_utf8_chars >= 0)
7934 chars = coding->detected_utf8_chars;
7935 else
7936 chars = check_utf_8 (coding);
/* Drop a leading UTF-8 BOM: one character, three bytes.  */
7937 if (CODING_UTF_8_BOM (coding) != utf_without_bom
7938 && coding->head_ascii == 0
7939 && coding->source[0] == UTF_8_BOM_1
7940 && coding->source[1] == UTF_8_BOM_2
7941 && coding->source[2] == UTF_8_BOM_3)
7943 chars--;
7944 bytes -= 3;
7945 coding->src_bytes -= 3;
7948 else
7949 chars = -1;
/* CHARS >= 0 means the bytes can be inserted as they are, apart from
   end-of-line conversion.  */
7951 if (chars >= 0)
7953 Lisp_Object eol_type;
7955 eol_type = CODING_ID_EOL_TYPE (coding->id);
7956 if (VECTORP (eol_type))
7958 if (coding->eol_seen != EOL_SEEN_NONE)
7959 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
/* Mac EOL: replace every CR by LF in place; the size is unchanged.
   The source is addressed as the SRC_BYTES bytes ending at
   GAP_END_ADDR.  */
7961 if (EQ (eol_type, Qmac))
7963 unsigned char *src_end = GAP_END_ADDR;
7964 unsigned char *src = src_end - coding->src_bytes;
7966 while (src < src_end)
7968 if (*src++ == '\r')
7969 src[-1] = '\n';
/* DOS EOL: copy the bytes backwards onto themselves, skipping the CR
   of each CR LF pair, so the text is compacted towards GAP_END_ADDR.
   DIFF is the number of bytes dropped; each one is also one
   character.  */
7972 else if (EQ (eol_type, Qdos))
7974 unsigned char *src = GAP_END_ADDR;
7975 unsigned char *src_beg = src - coding->src_bytes;
7976 unsigned char *dst = src;
7977 ptrdiff_t diff;
7979 while (src_beg < src)
7981 *--dst = *--src;
7982 if (*src == '\n' && src > src_beg && src[-1] == '\r')
7983 src--;
7985 diff = dst - src;
7986 bytes -= diff;
7987 chars -= diff;
7989 coding->produced = bytes;
7990 coding->produced_char = chars;
7991 insert_from_gap (chars, bytes, 1);
/* Nothing has been pushed on the specpdl yet (code_conversion_save is
   only called below), so returning without unbind_to is fine.  */
7992 return;
/* General path: run the real decoder.  */
7995 code_conversion_save (0, 0);
7997 coding->mode |= CODING_MODE_LAST_BLOCK;
7998 current_buffer->text->inhibit_shrinking = 1;
7999 decode_coding (coding);
8000 current_buffer->text->inhibit_shrinking = 0;
/* Run the post-read function at the start of the decoded text, passing
   it the number of decoded characters, and account for whatever size
   change it made to the buffer.  */
8002 if (! NILP (CODING_ATTR_POST_READ (attrs)))
8004 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8005 Lisp_Object val;
8007 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8008 val = call1 (CODING_ATTR_POST_READ (attrs),
8009 make_number (coding->produced_char));
8010 CHECK_NATNUM (val);
8011 coding->produced_char += Z - prev_Z;
8012 coding->produced += Z_BYTE - prev_Z_BYTE;
8015 unbind_to (count, Qnil);
/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer, a string, or Qnil.

   If it is a buffer, the text is at point of the buffer.  FROM and TO
   are positions in the buffer.

   If it is a string, the text is at the beginning of the string.
   FROM and TO are indices into the string.

   If it is nil, the text is at coding->source.  FROM and TO are
   indices into coding->source.

   DST_OBJECT is a buffer, Qt, or Qnil.

   If it is a buffer, the decoded text is inserted at point of the
   buffer.  If the buffer is the same as SRC_OBJECT, the source text
   is deleted.

   If it is Qt, a string is made from the decoded text, and
   set in CODING->dst_object.

   If it is Qnil, the decoded text is stored at CODING->destination.
   The caller must allocate CODING->dst_bytes bytes at
   CODING->destination by xmalloc.  If the decoded text is longer than
   CODING->dst_bytes, CODING->destination is relocated by xrealloc.  */
/* Entry point for decoding text from a string, a buffer or
   CODING->source into a buffer, a new string or CODING->destination.
   CODING->dst_object is also passed to unbind_to as the final step.  */
8048 void
8049 decode_coding_object (struct coding_system *coding,
8050 Lisp_Object src_object,
8051 ptrdiff_t from, ptrdiff_t from_byte,
8052 ptrdiff_t to, ptrdiff_t to_byte,
8053 Lisp_Object dst_object)
8055 ptrdiff_t count = SPECPDL_INDEX ();
8056 unsigned char *destination IF_LINT (= NULL);
8057 ptrdiff_t dst_bytes IF_LINT (= 0);
8058 ptrdiff_t chars = to - from;
8059 ptrdiff_t bytes = to_byte - from_byte;
8060 Lisp_Object attrs;
/* SAVED_PT stays negative unless source and destination are the same
   buffer; it doubles as the flag for that case further down.  */
8061 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8062 bool need_marker_adjustment = 0;
8063 Lisp_Object old_deactivate_mark;
/* Vdeactivate_mark is saved here and restored at the end of the
   function.  */
8065 old_deactivate_mark = Vdeactivate_mark;
/* Remember the caller-supplied destination area; it is only consulted
   again in the NILP (dst_object) branch after decoding.  */
8067 if (NILP (dst_object))
8069 destination = coding->destination;
8070 dst_bytes = coding->dst_bytes;
8073 coding->src_object = src_object;
8074 coding->src_chars = chars;
8075 coding->src_bytes = bytes;
8076 coding->src_multibyte = chars < bytes;
8078 if (STRINGP (src_object))
8080 coding->src_pos = from;
8081 coding->src_pos_byte = from_byte;
8083 else if (BUFFERP (src_object))
8085 set_buffer_internal (XBUFFER (src_object));
8086 if (from != GPT)
8087 move_gap_both (from, from_byte);
8088 if (EQ (src_object, dst_object))
8090 struct Lisp_Marker *tail;
/* Flag the markers sitting exactly at the edge of the region (FROM for
   insertion-type markers, TO for the others) so they can be
   repositioned after the text has been replaced.  */
8092 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8094 tail->need_adjustment
8095 = tail->charpos == (tail->insertion_type ? from : to);
8096 need_marker_adjustment |= tail->need_adjustment;
/* Delete the source range from the buffer with shrinking inhibited and
   refer to it with negative positions.  NOTE(review): presumably the
   deleted bytes stay readable in the gap; confirm in
   coding_set_source.  */
8098 saved_pt = PT, saved_pt_byte = PT_BYTE;
8099 TEMP_SET_PT_BOTH (from, from_byte);
8100 current_buffer->text->inhibit_shrinking = 1;
8101 del_range_both (from, from_byte, to, to_byte, 1);
8102 coding->src_pos = -chars;
8103 coding->src_pos_byte = -bytes;
8105 else
8107 coding->src_pos = from;
8108 coding->src_pos_byte = from_byte;
8112 if (CODING_REQUIRE_DETECTION (coding))
8113 detect_coding (coding);
8114 attrs = CODING_ID_ATTRS (coding->id);
/* Decode into a work buffer when a string result is wanted, or when a
   post-read function must run although the caller asked for raw
   output in CODING->destination.  */
8116 if (EQ (dst_object, Qt)
8117 || (! NILP (CODING_ATTR_POST_READ (attrs))
8118 && NILP (dst_object)))
8120 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8121 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8122 coding->dst_pos = BEG;
8123 coding->dst_pos_byte = BEG_BYTE;
8125 else if (BUFFERP (dst_object))
8127 code_conversion_save (0, 0);
8128 coding->dst_object = dst_object;
8129 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8130 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8131 coding->dst_multibyte
8132 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8134 else
8136 code_conversion_save (0, 0);
8137 coding->dst_object = Qnil;
8138 /* Most callers presume this will return a multibyte result, and they
8139 won't use `binary' or `raw-text' anyway, so let's not worry about
8140 CODING_FOR_UNIBYTE. */
8141 coding->dst_multibyte = 1;
8144 decode_coding (coding);
8146 if (BUFFERP (coding->dst_object))
8147 set_buffer_internal (XBUFFER (coding->dst_object));
/* Run the post-read function at the start of the decoded text and fold
   the resulting change of buffer size into the produced counts.  */
8149 if (! NILP (CODING_ATTR_POST_READ (attrs)))
8151 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8152 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8153 Lisp_Object val;
8155 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8156 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8157 old_deactivate_mark);
8158 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8159 make_number (coding->produced_char));
8160 UNGCPRO;
8161 CHECK_NATNUM (val);
8162 coding->produced_char += Z - prev_Z;
8163 coding->produced += Z_BYTE - prev_Z_BYTE;
8166 if (EQ (dst_object, Qt))
8168 coding->dst_object = Fbuffer_string ();
/* Raw output was requested but decoding went through a work buffer
   (the post-read case above).  The text is copied out only when it
   does not fit in the DST_BYTES the caller provided.  */
8170 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8172 set_buffer_internal (XBUFFER (coding->dst_object));
8173 if (dst_bytes < coding->produced)
8175 eassert (coding->produced > 0);
8176 destination = xrealloc (destination, coding->produced);
8177 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8178 move_gap_both (BEGV, BEGV_BYTE);
8179 memcpy (destination, BEGV_ADDR, coding->produced);
8180 coding->destination = destination;
8184 if (saved_pt >= 0)
8186 /* This is the case of:
8187 (BUFFERP (src_object) && EQ (src_object, dst_object))
8188 As we have moved PT while replacing the original buffer
8189 contents, we must recover it now. */
8190 set_buffer_internal (XBUFFER (src_object));
8191 current_buffer->text->inhibit_shrinking = 0;
/* Point before the region is unchanged; point inside it goes to FROM;
   point after it is shifted by the size difference (counted in bytes
   when the buffer is unibyte).  */
8192 if (saved_pt < from)
8193 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8194 else if (saved_pt < from + chars)
8195 TEMP_SET_PT_BOTH (from, from_byte);
8196 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8197 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8198 saved_pt_byte + (coding->produced - bytes));
8199 else
8200 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8201 saved_pt_byte + (coding->produced - bytes));
/* Put the flagged markers at the start (insertion-type) or the end
   (others) of the decoded text.  */
8203 if (need_marker_adjustment)
8205 struct Lisp_Marker *tail;
8207 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8208 if (tail->need_adjustment)
8210 tail->need_adjustment = 0;
8211 if (tail->insertion_type)
8213 tail->bytepos = from_byte;
8214 tail->charpos = from;
8216 else
8218 tail->bytepos = from_byte + coding->produced;
8219 tail->charpos
8220 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8221 ? tail->bytepos : from + coding->produced_char);
8227 Vdeactivate_mark = old_deactivate_mark;
8228 unbind_to (count, coding->dst_object);
8232 void
8233 encode_coding_object (struct coding_system *coding,
8234 Lisp_Object src_object,
8235 ptrdiff_t from, ptrdiff_t from_byte,
8236 ptrdiff_t to, ptrdiff_t to_byte,
8237 Lisp_Object dst_object)
8239 ptrdiff_t count = SPECPDL_INDEX ();
8240 ptrdiff_t chars = to - from;
8241 ptrdiff_t bytes = to_byte - from_byte;
8242 Lisp_Object attrs;
8243 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8244 bool need_marker_adjustment = 0;
8245 bool kill_src_buffer = 0;
8246 Lisp_Object old_deactivate_mark;
8248 old_deactivate_mark = Vdeactivate_mark;
8250 coding->src_object = src_object;
8251 coding->src_chars = chars;
8252 coding->src_bytes = bytes;
8253 coding->src_multibyte = chars < bytes;
8255 attrs = CODING_ID_ATTRS (coding->id);
8257 if (EQ (src_object, dst_object))
8259 struct Lisp_Marker *tail;
8261 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8263 tail->need_adjustment
8264 = tail->charpos == (tail->insertion_type ? from : to);
8265 need_marker_adjustment |= tail->need_adjustment;
8269 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8271 coding->src_object = code_conversion_save (1, coding->src_multibyte);
8272 set_buffer_internal (XBUFFER (coding->src_object));
8273 if (STRINGP (src_object))
8274 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8275 else if (BUFFERP (src_object))
8276 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8277 else
8278 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8280 if (EQ (src_object, dst_object))
8282 set_buffer_internal (XBUFFER (src_object));
8283 saved_pt = PT, saved_pt_byte = PT_BYTE;
8284 del_range_both (from, from_byte, to, to_byte, 1);
8285 set_buffer_internal (XBUFFER (coding->src_object));
8289 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8291 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8292 old_deactivate_mark);
8293 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8294 make_number (BEG), make_number (Z));
8295 UNGCPRO;
8297 if (XBUFFER (coding->src_object) != current_buffer)
8298 kill_src_buffer = 1;
8299 coding->src_object = Fcurrent_buffer ();
8300 if (BEG != GPT)
8301 move_gap_both (BEG, BEG_BYTE);
8302 coding->src_chars = Z - BEG;
8303 coding->src_bytes = Z_BYTE - BEG_BYTE;
8304 coding->src_pos = BEG;
8305 coding->src_pos_byte = BEG_BYTE;
8306 coding->src_multibyte = Z < Z_BYTE;
8308 else if (STRINGP (src_object))
8310 code_conversion_save (0, 0);
8311 coding->src_pos = from;
8312 coding->src_pos_byte = from_byte;
8314 else if (BUFFERP (src_object))
8316 code_conversion_save (0, 0);
8317 set_buffer_internal (XBUFFER (src_object));
8318 if (EQ (src_object, dst_object))
8320 saved_pt = PT, saved_pt_byte = PT_BYTE;
8321 coding->src_object = del_range_1 (from, to, 1, 1);
8322 coding->src_pos = 0;
8323 coding->src_pos_byte = 0;
8325 else
8327 if (from < GPT && to >= GPT)
8328 move_gap_both (from, from_byte);
8329 coding->src_pos = from;
8330 coding->src_pos_byte = from_byte;
8333 else
8334 code_conversion_save (0, 0);
8336 if (BUFFERP (dst_object))
8338 coding->dst_object = dst_object;
8339 if (EQ (src_object, dst_object))
8341 coding->dst_pos = from;
8342 coding->dst_pos_byte = from_byte;
8344 else
8346 struct buffer *current = current_buffer;
8348 set_buffer_temp (XBUFFER (dst_object));
8349 coding->dst_pos = PT;
8350 coding->dst_pos_byte = PT_BYTE;
8351 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8352 set_buffer_temp (current);
8354 coding->dst_multibyte
8355 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8357 else if (EQ (dst_object, Qt))
8359 ptrdiff_t dst_bytes = max (1, coding->src_chars);
8360 coding->dst_object = Qnil;
8361 coding->destination = xmalloc (dst_bytes);
8362 coding->dst_bytes = dst_bytes;
8363 coding->dst_multibyte = 0;
8365 else
8367 coding->dst_object = Qnil;
8368 coding->dst_multibyte = 0;
8371 encode_coding (coding);
8373 if (EQ (dst_object, Qt))
8375 if (BUFFERP (coding->dst_object))
8376 coding->dst_object = Fbuffer_string ();
8377 else if (coding->raw_destination)
8378 /* This is used to avoid creating huge Lisp string.
8379 NOTE: caller who sets `raw_destination' is also
8380 responsible for freeing `destination' buffer. */
8381 coding->dst_object = Qnil;
8382 else
8384 coding->dst_object
8385 = make_unibyte_string ((char *) coding->destination,
8386 coding->produced);
8387 xfree (coding->destination);
8391 if (saved_pt >= 0)
8393 /* This is the case of:
8394 (BUFFERP (src_object) && EQ (src_object, dst_object))
8395 As we have moved PT while replacing the original buffer
8396 contents, we must recover it now. */
8397 set_buffer_internal (XBUFFER (src_object));
8398 if (saved_pt < from)
8399 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8400 else if (saved_pt < from + chars)
8401 TEMP_SET_PT_BOTH (from, from_byte);
8402 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8403 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8404 saved_pt_byte + (coding->produced - bytes));
8405 else
8406 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8407 saved_pt_byte + (coding->produced - bytes));
8409 if (need_marker_adjustment)
8411 struct Lisp_Marker *tail;
8413 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8414 if (tail->need_adjustment)
8416 tail->need_adjustment = 0;
8417 if (tail->insertion_type)
8419 tail->bytepos = from_byte;
8420 tail->charpos = from;
8422 else
8424 tail->bytepos = from_byte + coding->produced;
8425 tail->charpos
8426 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8427 ? tail->bytepos : from + coding->produced_char);
8433 if (kill_src_buffer)
8434 Fkill_buffer (coding->src_object);
8436 Vdeactivate_mark = old_deactivate_mark;
8437 unbind_to (count, Qnil);
8441 Lisp_Object
8442 preferred_coding_system (void)
8444 int id = coding_categories[coding_priorities[0]].id;
8446 return CODING_ID_NAME (id);
8449 #if defined (WINDOWSNT) || defined (CYGWIN)
8451 Lisp_Object
8452 from_unicode (Lisp_Object str)
8454 CHECK_STRING (str);
8455 if (!STRING_MULTIBYTE (str) &&
8456 SBYTES (str) & 1)
8458 str = Fsubstring (str, make_number (0), make_number (-1));
8461 return code_convert_string_norecord (str, Qutf_16le, 0);
8464 Lisp_Object
8465 from_unicode_buffer (const wchar_t* wstr)
8467 return from_unicode (
8468 make_unibyte_string (
8469 (char*) wstr,
8470 /* we get one of the two final 0 bytes for free. */
8471 1 + sizeof (wchar_t) * wcslen (wstr)));
8474 wchar_t *
8475 to_unicode (Lisp_Object str, Lisp_Object *buf)
8477 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8478 /* We need to make another copy (in addition to the one made by
8479 code_convert_string_norecord) to ensure that the final string is
8480 _doubly_ zero terminated --- that is, that the string is
8481 terminated by two zero bytes and one utf-16le null character.
8482 Because strings are already terminated with a single zero byte,
8483 we just add one additional zero. */
8484 str = make_uninit_string (SBYTES (*buf) + 1);
8485 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8486 SDATA (str) [SBYTES (*buf)] = '\0';
8487 *buf = str;
8488 return WCSDATA (*buf);
8491 #endif /* WINDOWSNT || CYGWIN */
8494 #ifdef emacs
8495 /*** 8. Emacs Lisp library functions ***/
8497 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8498 doc: /* Return t if OBJECT is nil or a coding-system.
8499 See the documentation of `define-coding-system' for information
8500 about coding-system objects. */)
8501 (Lisp_Object object)
8503 if (NILP (object)
8504 || CODING_SYSTEM_ID (object) >= 0)
8505 return Qt;
8506 if (! SYMBOLP (object)
8507 || NILP (Fget (object, Qcoding_system_define_form)))
8508 return Qnil;
8509 return Qt;
8512 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8513 Sread_non_nil_coding_system, 1, 1, 0,
8514 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8515 (Lisp_Object prompt)
8517 Lisp_Object val;
8520 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8521 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8523 while (SCHARS (val) == 0);
8524 return (Fintern (val, Qnil));
8527 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8528 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8529 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8530 Ignores case when completing coding systems (all Emacs coding systems
8531 are lower-case). */)
8532 (Lisp_Object prompt, Lisp_Object default_coding_system)
8534 Lisp_Object val;
8535 ptrdiff_t count = SPECPDL_INDEX ();
8537 if (SYMBOLP (default_coding_system))
8538 default_coding_system = SYMBOL_NAME (default_coding_system);
8539 specbind (Qcompletion_ignore_case, Qt);
8540 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8541 Qt, Qnil, Qcoding_system_history,
8542 default_coding_system, Qnil);
8543 unbind_to (count, Qnil);
8544 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8547 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8548 1, 1, 0,
8549 doc: /* Check validity of CODING-SYSTEM.
8550 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8551 It is valid if it is nil or a symbol defined as a coding system by the
8552 function `define-coding-system'. */)
8553 (Lisp_Object coding_system)
8555 Lisp_Object define_form;
8557 define_form = Fget (coding_system, Qcoding_system_define_form);
8558 if (! NILP (define_form))
8560 Fput (coding_system, Qcoding_system_define_form, Qnil);
8561 safe_eval (define_form);
8563 if (!NILP (Fcoding_system_p (coding_system)))
8564 return coding_system;
8565 xsignal1 (Qcoding_system_error, coding_system);
8569 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
   HIGHEST, return the coding system of the highest
   priority among the detected coding systems.  Otherwise return a
   list of detected coding systems sorted by their priorities.  If
   MULTIBYTEP, it is assumed that the bytes are in correct
   multibyte form but contains only ASCII and eight-bit chars.
   Otherwise, the bytes are raw bytes.

   SRC_CHARS is recorded as the character count of the source.

   CODING-SYSTEM controls the detection as below:

   If it is nil, detect both text-format and eol-format.  If the
   text-format part of CODING-SYSTEM is already specified
   (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
   part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
   detect only text-format.

   With HIGHEST, the value is Qnil if nothing was detected.  */
8585 Lisp_Object
8586 detect_coding_system (const unsigned char *src,
8587 ptrdiff_t src_chars, ptrdiff_t src_bytes,
8588 bool highest, bool multibytep,
8589 Lisp_Object coding_system)
8591 const unsigned char *src_end = src + src_bytes;
8592 Lisp_Object attrs, eol_type;
/* VAL collects the result: first a list of coding-system ids, which
   the eol pass at the end replaces by coding-system names.  */
8593 Lisp_Object val = Qnil;
8594 struct coding_system coding;
8595 ptrdiff_t id;
8596 struct coding_detection_info detect_info;
8597 enum coding_category base_category;
8598 bool null_byte_found = 0, eight_bit_found = 0;
8600 if (NILP (coding_system))
8601 coding_system = Qundecided;
8602 setup_coding_system (coding_system, &coding);
8603 attrs = CODING_ID_ATTRS (coding.id);
8604 eol_type = CODING_ID_EOL_TYPE (coding.id);
8605 coding_system = CODING_ATTR_BASE_NAME (attrs);
8607 coding.source = src;
8608 coding.src_chars = src_chars;
8609 coding.src_bytes = src_bytes;
8610 coding.src_multibyte = multibytep;
8611 coding.consumed = 0;
8612 coding.mode |= CODING_MODE_LAST_BLOCK;
/* head_ascii counts the bytes seen before the first eight-bit byte;
   it is advanced by the scanning loop below.  */
8613 coding.head_ascii = 0;
8615 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8617 /* At first, detect text-format if necessary. */
8618 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8619 if (base_category == coding_category_undecided)
8621 enum coding_category category IF_LINT (= 0);
8622 struct coding_system *this IF_LINT (= NULL);
8623 int c, i;
8624 bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8625 inhibit_null_byte_detection);
8626 bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8627 inhibit_iso_escape_detection);
8628 bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8630 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8631 for (; src < src_end; src++)
8633 c = *src;
8634 if (c & 0x80)
8636 eight_bit_found = 1;
8637 if (null_byte_found)
8638 break;
8640 else if (c < 0x20)
/* The ISO-2022 detector is run at most once, on the first ESC, SI or
   SO, and only while no category has been checked yet.  */
8642 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8643 && ! inhibit_ied
8644 && ! detect_info.checked)
8646 if (detect_coding_iso_2022 (&coding, &detect_info))
8648 /* We have scanned the whole data. */
8649 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8651 /* We didn't find an 8-bit code. We may
8652 have found a null-byte, but it's very
8653 rare that a binary file confirm to
8654 ISO-2022. */
8655 src = src_end;
8656 coding.head_ascii = src - coding.source;
8658 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8659 break;
8662 else if (! c && !inhibit_nbd)
8664 null_byte_found = 1;
8665 if (eight_bit_found)
8666 break;
8668 if (! eight_bit_found)
8669 coding.head_ascii++;
8671 else if (! eight_bit_found)
8672 coding.head_ascii++;
/* Run the category detectors only if the scan above turned up
   something other than plain ASCII.  */
8675 if (null_byte_found || eight_bit_found
8676 || coding.head_ascii < coding.src_bytes
8677 || detect_info.found)
8679 if (coding.head_ascii == coding.src_bytes)
8680 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8681 for (i = 0; i < coding_category_raw_text; i++)
8683 category = coding_priorities[i];
8684 this = coding_categories + category;
8685 if (detect_info.found & (1 << category))
8686 break;
8688 else
/* A null byte leaves only the UTF-16 categories open.  Likewise, a
   successful UTF-8 detection with PREFER_UTF_8 leaves only the UTF-8
   categories open.  */
8690 if (null_byte_found)
8692 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8693 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8695 else if (prefer_utf_8
8696 && detect_coding_utf_8 (&coding, &detect_info))
8698 detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8699 detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
/* Try the remaining categories in priority order.  With HIGHEST the
   loop stops at the first category that is found.  */
8701 for (i = 0; i < coding_category_raw_text; i++)
8703 category = coding_priorities[i];
8704 this = coding_categories + category;
8706 if (this->id < 0)
8708 /* No coding system of this category is defined. */
8709 detect_info.rejected |= (1 << category);
8711 else if (category >= coding_category_raw_text)
8712 continue;
8713 else if (detect_info.checked & (1 << category))
8715 if (highest
8716 && (detect_info.found & (1 << category)))
8717 break;
8719 else if ((*(this->detector)) (&coding, &detect_info)
8720 && highest
8721 && (detect_info.found & (1 << category)))
8723 if (category == coding_category_utf_16_auto)
8725 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8726 category = coding_category_utf_16_le;
8727 else
8728 category = coding_category_utf_16_be;
8730 break;
/* Turn the detection masks into the result list VAL.  Everything
   rejected, or a null byte seen: no-conversion.  Nothing rejected and
   nothing found: the coding system of the undecided category.  */
8736 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8737 || null_byte_found)
8739 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8740 id = CODING_SYSTEM_ID (Qno_conversion);
8741 val = list1 (make_number (id));
8743 else if (! detect_info.rejected && ! detect_info.found)
8745 detect_info.found = CATEGORY_MASK_ANY;
8746 id = coding_categories[coding_category_undecided].id;
8747 val = list1 (make_number (id));
8749 else if (highest)
8751 if (detect_info.found)
8753 detect_info.found = 1 << category;
8754 val = list1 (make_number (this->id));
8756 else
/* Nothing positively found: take the highest-priority category that
   was not rejected.  */
8757 for (i = 0; i < coding_category_raw_text; i++)
8758 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8760 detect_info.found = 1 << coding_priorities[i];
8761 id = coding_categories[coding_priorities[i]].id;
8762 val = list1 (make_number (id));
8763 break;
8766 else
8768 int mask = detect_info.rejected | detect_info.found;
8769 int found = 0;
/* First pass: categories that were neither found nor rejected.  VAL is
   reassigned to a one-element list each time, so of these only the
   last one visited (the highest-priority one with a defined coding
   system) ends up in VAL.  */
8771 for (i = coding_category_raw_text - 1; i >= 0; i--)
8773 category = coding_priorities[i];
8774 if (! (mask & (1 << category)))
8776 found |= 1 << category;
8777 id = coding_categories[category].id;
8778 if (id >= 0)
8779 val = list1 (make_number (id));
/* Second pass: push the positively found categories in front, lowest
   priority first, so that VAL ends up in priority order.  */
8782 for (i = coding_category_raw_text - 1; i >= 0; i--)
8784 category = coding_priorities[i];
8785 if (detect_info.found & (1 << category))
8787 id = coding_categories[category].id;
8788 val = Fcons (make_number (id), val);
8791 detect_info.found |= found;
/* The base coding system is utf-8-auto: only decide between the
   with-signature and without-signature variants.  */
8794 else if (base_category == coding_category_utf_8_auto)
8796 if (detect_coding_utf_8 (&coding, &detect_info))
8798 struct coding_system *this;
8800 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8801 this = coding_categories + coding_category_utf_8_sig;
8802 else
8803 this = coding_categories + coding_category_utf_8_nosig;
8804 val = list1 (make_number (this->id));
/* The base coding system is utf-16-auto: only decide the byte order
   and signature variant.  */
8807 else if (base_category == coding_category_utf_16_auto)
8809 if (detect_coding_utf_16 (&coding, &detect_info))
8811 struct coding_system *this;
8813 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8814 this = coding_categories + coding_category_utf_16_le;
8815 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8816 this = coding_categories + coding_category_utf_16_be;
8817 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8818 this = coding_categories + coding_category_utf_16_be_nosig;
8819 else
8820 this = coding_categories + coding_category_utf_16_le_nosig;
8821 val = list1 (make_number (this->id));
/* The text format was given by the caller; nothing to detect.  */
8824 else
8826 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8827 val = list1 (make_number (coding.id));
8830 /* Then, detect eol-format if necessary. */
8832 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8833 Lisp_Object tail;
/* A vector EOL_TYPE means the eol format is not fixed by the coding
   system and has to be detected, separately for ordinary text and for
   each UTF-16 byte order.  */
8835 if (VECTORP (eol_type))
8837 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8839 if (null_byte_found)
8840 normal_eol = EOL_SEEN_LF;
8841 else
8842 normal_eol = detect_eol (coding.source, src_bytes,
8843 coding_category_raw_text);
8845 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8846 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8847 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8848 coding_category_utf_16_be);
8849 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8850 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8851 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8852 coding_category_utf_16_le);
8854 else
8856 if (EQ (eol_type, Qunix))
8857 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8858 else if (EQ (eol_type, Qdos))
8859 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8860 else
8861 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
/* Replace each id in VAL by a coding-system name: the element of the
   coding system's eol-type vector selected by the detected eol format
   (index 0 for LF, 1 for CRLF, 2 for CR), or the coding system's own
   name when no single format was detected or the eol type is fixed.  */
8864 for (tail = val; CONSP (tail); tail = XCDR (tail))
8866 enum coding_category category;
8867 int this_eol;
8869 id = XINT (XCAR (tail));
8870 attrs = CODING_ID_ATTRS (id);
8871 category = XINT (CODING_ATTR_CATEGORY (attrs));
8872 eol_type = CODING_ID_EOL_TYPE (id);
8873 if (VECTORP (eol_type))
8875 if (category == coding_category_utf_16_be
8876 || category == coding_category_utf_16_be_nosig)
8877 this_eol = utf_16_be_eol;
8878 else if (category == coding_category_utf_16_le
8879 || category == coding_category_utf_16_le_nosig)
8880 this_eol = utf_16_le_eol;
8881 else
8882 this_eol = normal_eol;
8884 if (this_eol == EOL_SEEN_LF)
8885 XSETCAR (tail, AREF (eol_type, 0));
8886 else if (this_eol == EOL_SEEN_CRLF)
8887 XSETCAR (tail, AREF (eol_type, 1));
8888 else if (this_eol == EOL_SEEN_CR)
8889 XSETCAR (tail, AREF (eol_type, 2));
8890 else
8891 XSETCAR (tail, CODING_ID_NAME (id));
8893 else
8894 XSETCAR (tail, CODING_ID_NAME (id));
8898 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8902 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8903 2, 3, 0,
8904 doc: /* Detect coding system of the text in the region between START and END.
8905 Return a list of possible coding systems ordered by priority.
8906 The coding systems to try and their priorities follows what
8907 the function `coding-system-priority-list' (which see) returns.
8909 If only ASCII characters are found (except for such ISO-2022 control
8910 characters as ESC), it returns a list of single element `undecided'
8911 or its subsidiary coding system according to a detected end-of-line
8912 format.
8914 If optional argument HIGHEST is non-nil, return the coding system of
8915 highest priority. */)
8916 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8918 ptrdiff_t from, to;
8919 ptrdiff_t from_byte, to_byte;
8921 validate_region (&start, &end);
8922 from = XINT (start), to = XINT (end);
8923 from_byte = CHAR_TO_BYTE (from);
8924 to_byte = CHAR_TO_BYTE (to);
8926 if (from < GPT && to >= GPT)
8927 move_gap_both (to, to_byte);
8929 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8930 to - from, to_byte - from_byte,
8931 !NILP (highest),
8932 !NILP (BVAR (current_buffer
8933 , enable_multibyte_characters)),
8934 Qnil);
8937 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8938 1, 2, 0,
8939 doc: /* Detect coding system of the text in STRING.
8940 Return a list of possible coding systems ordered by priority.
8941 The coding systems to try and their priorities follows what
8942 the function `coding-system-priority-list' (which see) returns.
8944 If only ASCII characters are found (except for such ISO-2022 control
8945 characters as ESC), it returns a list of single element `undecided'
8946 or its subsidiary coding system according to a detected end-of-line
8947 format.
8949 If optional argument HIGHEST is non-nil, return the coding system of
8950 highest priority. */)
8951 (Lisp_Object string, Lisp_Object highest)
8953 CHECK_STRING (string);
8955 return detect_coding_system (SDATA (string),
8956 SCHARS (string), SBYTES (string),
8957 !NILP (highest), STRING_MULTIBYTE (string),
8958 Qnil);
8962 static bool
8963 char_encodable_p (int c, Lisp_Object attrs)
8965 Lisp_Object tail;
8966 struct charset *charset;
8967 Lisp_Object translation_table;
8969 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8970 if (! NILP (translation_table))
8971 c = translate_char (translation_table, c);
8972 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8973 CONSP (tail); tail = XCDR (tail))
8975 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8976 if (CHAR_CHARSET_P (c, charset))
8977 break;
8979 return (! NILP (tail));
8983 /* Return a list of coding systems that safely encode the text between
8984 START and END. If EXCLUDE is non-nil, it is a list of coding
8985 systems not to check. The returned list doesn't contain any such
8986 coding systems. In any case, if the text contains only ASCII or is
8987 unibyte, return t. */
8989 DEFUN ("find-coding-systems-region-internal",
8990 Ffind_coding_systems_region_internal,
8991 Sfind_coding_systems_region_internal, 2, 3, 0,
8992 doc: /* Internal use only. */)
8993 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8995 Lisp_Object coding_attrs_list, safe_codings;
8996 ptrdiff_t start_byte, end_byte;
8997 const unsigned char *p, *pbeg, *pend;
8998 int c;
8999 Lisp_Object tail, elt, work_table;
9001 if (STRINGP (start))
9003 if (!STRING_MULTIBYTE (start)
9004 || SCHARS (start) == SBYTES (start))
9005 return Qt;
9006 start_byte = 0;
9007 end_byte = SBYTES (start);
9009 else
9011 CHECK_NUMBER_COERCE_MARKER (start);
9012 CHECK_NUMBER_COERCE_MARKER (end);
9013 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9014 args_out_of_range (start, end);
9015 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9016 return Qt;
9017 start_byte = CHAR_TO_BYTE (XINT (start));
9018 end_byte = CHAR_TO_BYTE (XINT (end));
9019 if (XINT (end) - XINT (start) == end_byte - start_byte)
9020 return Qt;
9022 if (XINT (start) < GPT && XINT (end) > GPT)
9024 if ((GPT - XINT (start)) < (XINT (end) - GPT))
9025 move_gap_both (XINT (start), start_byte);
9026 else
9027 move_gap_both (XINT (end), end_byte);
9031 coding_attrs_list = Qnil;
9032 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9033 if (NILP (exclude)
9034 || NILP (Fmemq (XCAR (tail), exclude)))
9036 Lisp_Object attrs;
9038 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9039 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9041 ASET (attrs, coding_attr_trans_tbl,
9042 get_translation_table (attrs, 1, NULL));
9043 coding_attrs_list = Fcons (attrs, coding_attrs_list);
9047 if (STRINGP (start))
9048 p = pbeg = SDATA (start);
9049 else
9050 p = pbeg = BYTE_POS_ADDR (start_byte);
9051 pend = p + (end_byte - start_byte);
9053 while (p < pend && ASCII_BYTE_P (*p)) p++;
9054 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9056 work_table = Fmake_char_table (Qnil, Qnil);
9057 while (p < pend)
9059 if (ASCII_BYTE_P (*p))
9060 p++;
9061 else
9063 c = STRING_CHAR_ADVANCE (p);
9064 if (!NILP (char_table_ref (work_table, c)))
9065 /* This character was already checked. Ignore it. */
9066 continue;
9068 charset_map_loaded = 0;
9069 for (tail = coding_attrs_list; CONSP (tail);)
9071 elt = XCAR (tail);
9072 if (NILP (elt))
9073 tail = XCDR (tail);
9074 else if (char_encodable_p (c, elt))
9075 tail = XCDR (tail);
9076 else if (CONSP (XCDR (tail)))
9078 XSETCAR (tail, XCAR (XCDR (tail)));
9079 XSETCDR (tail, XCDR (XCDR (tail)));
9081 else
9083 XSETCAR (tail, Qnil);
9084 tail = XCDR (tail);
9087 if (charset_map_loaded)
9089 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9091 if (STRINGP (start))
9092 pbeg = SDATA (start);
9093 else
9094 pbeg = BYTE_POS_ADDR (start_byte);
9095 p = pbeg + p_offset;
9096 pend = pbeg + pend_offset;
9098 char_table_set (work_table, c, Qt);
9102 safe_codings = list2 (Qraw_text, Qno_conversion);
9103 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9104 if (! NILP (XCAR (tail)))
9105 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9107 return safe_codings;
9111 DEFUN ("unencodable-char-position", Funencodable_char_position,
9112 Sunencodable_char_position, 3, 5, 0,
9113 doc: /*
9114 Return position of first un-encodable character in a region.
9115 START and END specify the region and CODING-SYSTEM specifies the
9116 encoding to check. Return nil if CODING-SYSTEM does encode the region.
9118 If optional 4th argument COUNT is non-nil, it specifies at most how
9119 many un-encodable characters to search. In this case, the value is a
9120 list of positions.
9122 If optional 5th argument STRING is non-nil, it is a string to search
9123 for un-encodable characters. In that case, START and END are indexes
9124 to the string. */)
9125 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9127 EMACS_INT n;
9128 struct coding_system coding;
9129 Lisp_Object attrs, charset_list, translation_table;
9130 Lisp_Object positions;
9131 ptrdiff_t from, to;
9132 const unsigned char *p, *stop, *pend;
9133 bool ascii_compatible;
9135 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9136 attrs = CODING_ID_ATTRS (coding.id);
9137 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9138 return Qnil;
9139 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9140 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9141 translation_table = get_translation_table (attrs, 1, NULL);
9143 if (NILP (string))
9145 validate_region (&start, &end);
9146 from = XINT (start);
9147 to = XINT (end);
9148 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9149 || (ascii_compatible
9150 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9151 return Qnil;
9152 p = CHAR_POS_ADDR (from);
9153 pend = CHAR_POS_ADDR (to);
9154 if (from < GPT && to >= GPT)
9155 stop = GPT_ADDR;
9156 else
9157 stop = pend;
9159 else
9161 CHECK_STRING (string);
9162 CHECK_NATNUM (start);
9163 CHECK_NATNUM (end);
9164 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9165 args_out_of_range_3 (string, start, end);
9166 from = XINT (start);
9167 to = XINT (end);
9168 if (! STRING_MULTIBYTE (string))
9169 return Qnil;
9170 p = SDATA (string) + string_char_to_byte (string, from);
9171 stop = pend = SDATA (string) + string_char_to_byte (string, to);
9172 if (ascii_compatible && (to - from) == (pend - p))
9173 return Qnil;
9176 if (NILP (count))
9177 n = 1;
9178 else
9180 CHECK_NATNUM (count);
9181 n = XINT (count);
9184 positions = Qnil;
9185 charset_map_loaded = 0;
9186 while (1)
9188 int c;
9190 if (ascii_compatible)
9191 while (p < stop && ASCII_BYTE_P (*p))
9192 p++, from++;
9193 if (p >= stop)
9195 if (p >= pend)
9196 break;
9197 stop = pend;
9198 p = GAP_END_ADDR;
9201 c = STRING_CHAR_ADVANCE (p);
9202 if (! (ASCII_CHAR_P (c) && ascii_compatible)
9203 && ! char_charset (translate_char (translation_table, c),
9204 charset_list, NULL))
9206 positions = Fcons (make_number (from), positions);
9207 n--;
9208 if (n == 0)
9209 break;
9212 from++;
9213 if (charset_map_loaded && NILP (string))
9215 p = CHAR_POS_ADDR (from);
9216 pend = CHAR_POS_ADDR (to);
9217 if (from < GPT && to >= GPT)
9218 stop = GPT_ADDR;
9219 else
9220 stop = pend;
9221 charset_map_loaded = 0;
9225 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9229 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9230 Scheck_coding_systems_region, 3, 3, 0,
9231 doc: /* Check if the region is encodable by coding systems.
9233 START and END are buffer positions specifying the region.
9234 CODING-SYSTEM-LIST is a list of coding systems to check.
9236 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9237 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9238 whole region, POS0, POS1, ... are buffer positions where non-encodable
9239 characters are found.
9241 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9242 value is nil.
9244 START may be a string. In that case, check if the string is
9245 encodable, and the value contains indices to the string instead of
9246 buffer positions. END is ignored.
9248 If the current buffer (or START if it is a string) is unibyte, the value
9249 is nil. */)
9250 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9252 Lisp_Object list;
9253 ptrdiff_t start_byte, end_byte;
9254 ptrdiff_t pos;
9255 const unsigned char *p, *pbeg, *pend;
9256 int c;
9257 Lisp_Object tail, elt, attrs;
9259 if (STRINGP (start))
9261 if (!STRING_MULTIBYTE (start)
9262 || SCHARS (start) == SBYTES (start))
9263 return Qnil;
9264 start_byte = 0;
9265 end_byte = SBYTES (start);
9266 pos = 0;
9268 else
9270 CHECK_NUMBER_COERCE_MARKER (start);
9271 CHECK_NUMBER_COERCE_MARKER (end);
9272 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9273 args_out_of_range (start, end);
9274 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9275 return Qnil;
9276 start_byte = CHAR_TO_BYTE (XINT (start));
9277 end_byte = CHAR_TO_BYTE (XINT (end));
9278 if (XINT (end) - XINT (start) == end_byte - start_byte)
9279 return Qnil;
9281 if (XINT (start) < GPT && XINT (end) > GPT)
9283 if ((GPT - XINT (start)) < (XINT (end) - GPT))
9284 move_gap_both (XINT (start), start_byte);
9285 else
9286 move_gap_both (XINT (end), end_byte);
9288 pos = XINT (start);
9291 list = Qnil;
9292 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9294 elt = XCAR (tail);
9295 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9296 ASET (attrs, coding_attr_trans_tbl,
9297 get_translation_table (attrs, 1, NULL));
9298 list = Fcons (list2 (elt, attrs), list);
9301 if (STRINGP (start))
9302 p = pbeg = SDATA (start);
9303 else
9304 p = pbeg = BYTE_POS_ADDR (start_byte);
9305 pend = p + (end_byte - start_byte);
9307 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9308 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9310 while (p < pend)
9312 if (ASCII_BYTE_P (*p))
9313 p++;
9314 else
9316 c = STRING_CHAR_ADVANCE (p);
9318 charset_map_loaded = 0;
9319 for (tail = list; CONSP (tail); tail = XCDR (tail))
9321 elt = XCDR (XCAR (tail));
9322 if (! char_encodable_p (c, XCAR (elt)))
9323 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9325 if (charset_map_loaded)
9327 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9329 if (STRINGP (start))
9330 pbeg = SDATA (start);
9331 else
9332 pbeg = BYTE_POS_ADDR (start_byte);
9333 p = pbeg + p_offset;
9334 pend = pbeg + pend_offset;
9337 pos++;
9340 tail = list;
9341 list = Qnil;
9342 for (; CONSP (tail); tail = XCDR (tail))
9344 elt = XCAR (tail);
9345 if (CONSP (XCDR (XCDR (elt))))
9346 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9347 list);
9350 return list;
9354 static Lisp_Object
9355 code_convert_region (Lisp_Object start, Lisp_Object end,
9356 Lisp_Object coding_system, Lisp_Object dst_object,
9357 bool encodep, bool norecord)
9359 struct coding_system coding;
9360 ptrdiff_t from, from_byte, to, to_byte;
9361 Lisp_Object src_object;
9363 if (NILP (coding_system))
9364 coding_system = Qno_conversion;
9365 else
9366 CHECK_CODING_SYSTEM (coding_system);
9367 src_object = Fcurrent_buffer ();
9368 if (NILP (dst_object))
9369 dst_object = src_object;
9370 else if (! EQ (dst_object, Qt))
9371 CHECK_BUFFER (dst_object);
9373 validate_region (&start, &end);
9374 from = XFASTINT (start);
9375 from_byte = CHAR_TO_BYTE (from);
9376 to = XFASTINT (end);
9377 to_byte = CHAR_TO_BYTE (to);
9379 setup_coding_system (coding_system, &coding);
9380 coding.mode |= CODING_MODE_LAST_BLOCK;
9382 if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9384 struct buffer *buf = XBUFFER (dst_object);
9385 ptrdiff_t buf_pt = BUF_PT (buf);
9387 invalidate_buffer_caches (buf, buf_pt, buf_pt);
9390 if (encodep)
9391 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9392 dst_object);
9393 else
9394 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9395 dst_object);
9396 if (! norecord)
9397 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9399 return (BUFFERP (dst_object)
9400 ? make_number (coding.produced_char)
9401 : coding.dst_object);
9405 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9406 3, 4, "r\nzCoding system: ",
9407 doc: /* Decode the current region from the specified coding system.
9408 When called from a program, takes four arguments:
9409 START, END, CODING-SYSTEM, and DESTINATION.
9410 START and END are buffer positions.
9412 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9413 If nil, the region between START and END is replaced by the decoded text.
9414 If buffer, the decoded text is inserted in that buffer after point (point
9415 does not move).
9416 In those cases, the length of the decoded text is returned.
9417 If DESTINATION is t, the decoded text is returned.
9419 This function sets `last-coding-system-used' to the precise coding system
9420 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9421 not fully specified.) */)
9422 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9424 return code_convert_region (start, end, coding_system, destination, 0, 0);
9427 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9428 3, 4, "r\nzCoding system: ",
9429 doc: /* Encode the current region by specified coding system.
9430 When called from a program, takes four arguments:
9431 START, END, CODING-SYSTEM and DESTINATION.
9432 START and END are buffer positions.
9434 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9435 If nil, the region between START and END is replace by the encoded text.
9436 If buffer, the encoded text is inserted in that buffer after point (point
9437 does not move).
9438 In those cases, the length of the encoded text is returned.
9439 If DESTINATION is t, the encoded text is returned.
9441 This function sets `last-coding-system-used' to the precise coding system
9442 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9443 not fully specified.) */)
9444 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9446 return code_convert_region (start, end, coding_system, destination, 1, 0);
9449 Lisp_Object
9450 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9451 Lisp_Object dst_object, bool encodep, bool nocopy,
9452 bool norecord)
9454 struct coding_system coding;
9455 ptrdiff_t chars, bytes;
9457 CHECK_STRING (string);
9458 if (NILP (coding_system))
9460 if (! norecord)
9461 Vlast_coding_system_used = Qno_conversion;
9462 if (NILP (dst_object))
9463 return (nocopy ? Fcopy_sequence (string) : string);
9466 if (NILP (coding_system))
9467 coding_system = Qno_conversion;
9468 else
9469 CHECK_CODING_SYSTEM (coding_system);
9470 if (NILP (dst_object))
9471 dst_object = Qt;
9472 else if (! EQ (dst_object, Qt))
9473 CHECK_BUFFER (dst_object);
9475 setup_coding_system (coding_system, &coding);
9476 coding.mode |= CODING_MODE_LAST_BLOCK;
9477 chars = SCHARS (string);
9478 bytes = SBYTES (string);
9480 if (BUFFERP (dst_object))
9482 struct buffer *buf = XBUFFER (dst_object);
9483 ptrdiff_t buf_pt = BUF_PT (buf);
9485 invalidate_buffer_caches (buf, buf_pt, buf_pt);
9488 if (encodep)
9489 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9490 else
9491 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9492 if (! norecord)
9493 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9495 return (BUFFERP (dst_object)
9496 ? make_number (coding.produced_char)
9497 : coding.dst_object);
9501 /* Encode or decode STRING according to CODING_SYSTEM.
9502 Do not set Vlast_coding_system_used.
9504 This function is called only from macros DECODE_FILE and
9505 ENCODE_FILE, thus we ignore character composition. */
9507 Lisp_Object
9508 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9509 bool encodep)
9511 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9514 /* Encode or decode a file name, to or from a unibyte string suitable
9515 for passing to C library functions. */
9516 Lisp_Object
9517 decode_file_name (Lisp_Object fname)
9519 #ifdef WINDOWSNT
9520 /* The w32 build pretends to use UTF-8 for file-name encoding, and
9521 converts the file names either to UTF-16LE or to the system ANSI
9522 codepage internally, depending on the underlying OS; see w32.c. */
9523 if (! NILP (Fcoding_system_p (Qutf_8)))
9524 return code_convert_string_norecord (fname, Qutf_8, 0);
9525 return fname;
9526 #else /* !WINDOWSNT */
9527 if (! NILP (Vfile_name_coding_system))
9528 return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9529 else if (! NILP (Vdefault_file_name_coding_system))
9530 return code_convert_string_norecord (fname,
9531 Vdefault_file_name_coding_system, 0);
9532 else
9533 return fname;
9534 #endif
9537 Lisp_Object
9538 encode_file_name (Lisp_Object fname)
9540 /* This is especially important during bootstrap and dumping, when
9541 file-name encoding is not yet known, and therefore any non-ASCII
9542 file names are unibyte strings, and could only be thrashed if we
9543 try to encode them. */
9544 if (!STRING_MULTIBYTE (fname))
9545 return fname;
9546 #ifdef WINDOWSNT
9547 /* The w32 build pretends to use UTF-8 for file-name encoding, and
9548 converts the file names either to UTF-16LE or to the system ANSI
9549 codepage internally, depending on the underlying OS; see w32.c. */
9550 if (! NILP (Fcoding_system_p (Qutf_8)))
9551 return code_convert_string_norecord (fname, Qutf_8, 1);
9552 return fname;
9553 #else /* !WINDOWSNT */
9554 if (! NILP (Vfile_name_coding_system))
9555 return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9556 else if (! NILP (Vdefault_file_name_coding_system))
9557 return code_convert_string_norecord (fname,
9558 Vdefault_file_name_coding_system, 1);
9559 else
9560 return fname;
9561 #endif
9564 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9565 2, 4, 0,
9566 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9568 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9569 if the decoding operation is trivial.
9571 Optional fourth arg BUFFER non-nil means that the decoded text is
9572 inserted in that buffer after point (point does not move). In this
9573 case, the return value is the length of the decoded text.
9575 This function sets `last-coding-system-used' to the precise coding system
9576 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9577 not fully specified.) */)
9578 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9580 return code_convert_string (string, coding_system, buffer,
9581 0, ! NILP (nocopy), 0);
9584 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9585 2, 4, 0,
9586 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9588 Optional third arg NOCOPY non-nil means it is OK to return STRING
9589 itself if the encoding operation is trivial.
9591 Optional fourth arg BUFFER non-nil means that the encoded text is
9592 inserted in that buffer after point (point does not move). In this
9593 case, the return value is the length of the encoded text.
9595 This function sets `last-coding-system-used' to the precise coding system
9596 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9597 not fully specified.) */)
9598 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9600 return code_convert_string (string, coding_system, buffer,
9601 1, ! NILP (nocopy), 0);
9605 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9606 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9607 Return the corresponding character. */)
9608 (Lisp_Object code)
9610 Lisp_Object spec, attrs, val;
9611 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9612 EMACS_INT ch;
9613 int c;
9615 CHECK_NATNUM (code);
9616 ch = XFASTINT (code);
9617 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9618 attrs = AREF (spec, 0);
9620 if (ASCII_BYTE_P (ch)
9621 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9622 return code;
9624 val = CODING_ATTR_CHARSET_LIST (attrs);
9625 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9626 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9627 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9629 if (ch <= 0x7F)
9631 c = ch;
9632 charset = charset_roman;
9634 else if (ch >= 0xA0 && ch < 0xDF)
9636 c = ch - 0x80;
9637 charset = charset_kana;
9639 else
9641 EMACS_INT c1 = ch >> 8;
9642 int c2 = ch & 0xFF;
9644 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9645 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9646 error ("Invalid code: %"pI"d", ch);
9647 c = ch;
9648 SJIS_TO_JIS (c);
9649 charset = charset_kanji;
9651 c = DECODE_CHAR (charset, c);
9652 if (c < 0)
9653 error ("Invalid code: %"pI"d", ch);
9654 return make_number (c);
9658 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9659 doc: /* Encode a Japanese character CH to shift_jis encoding.
9660 Return the corresponding code in SJIS. */)
9661 (Lisp_Object ch)
9663 Lisp_Object spec, attrs, charset_list;
9664 int c;
9665 struct charset *charset;
9666 unsigned code;
9668 CHECK_CHARACTER (ch);
9669 c = XFASTINT (ch);
9670 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9671 attrs = AREF (spec, 0);
9673 if (ASCII_CHAR_P (c)
9674 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9675 return ch;
9677 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9678 charset = char_charset (c, charset_list, &code);
9679 if (code == CHARSET_INVALID_CODE (charset))
9680 error ("Can't encode by shift_jis encoding: %c", c);
9681 JIS_TO_SJIS (code);
9683 return make_number (code);
9686 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9687 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9688 Return the corresponding character. */)
9689 (Lisp_Object code)
9691 Lisp_Object spec, attrs, val;
9692 struct charset *charset_roman, *charset_big5, *charset;
9693 EMACS_INT ch;
9694 int c;
9696 CHECK_NATNUM (code);
9697 ch = XFASTINT (code);
9698 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9699 attrs = AREF (spec, 0);
9701 if (ASCII_BYTE_P (ch)
9702 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9703 return code;
9705 val = CODING_ATTR_CHARSET_LIST (attrs);
9706 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9707 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9709 if (ch <= 0x7F)
9711 c = ch;
9712 charset = charset_roman;
9714 else
9716 EMACS_INT b1 = ch >> 8;
9717 int b2 = ch & 0x7F;
9718 if (b1 < 0xA1 || b1 > 0xFE
9719 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9720 error ("Invalid code: %"pI"d", ch);
9721 c = ch;
9722 charset = charset_big5;
9724 c = DECODE_CHAR (charset, c);
9725 if (c < 0)
9726 error ("Invalid code: %"pI"d", ch);
9727 return make_number (c);
9730 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9731 doc: /* Encode the Big5 character CH to BIG5 coding system.
9732 Return the corresponding character code in Big5. */)
9733 (Lisp_Object ch)
9735 Lisp_Object spec, attrs, charset_list;
9736 struct charset *charset;
9737 int c;
9738 unsigned code;
9740 CHECK_CHARACTER (ch);
9741 c = XFASTINT (ch);
9742 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9743 attrs = AREF (spec, 0);
9744 if (ASCII_CHAR_P (c)
9745 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9746 return ch;
9748 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9749 charset = char_charset (c, charset_list, &code);
9750 if (code == CHARSET_INVALID_CODE (charset))
9751 error ("Can't encode by Big5 encoding: %c", c);
9753 return make_number (code);
9757 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9758 Sset_terminal_coding_system_internal, 1, 2, 0,
9759 doc: /* Internal use only. */)
9760 (Lisp_Object coding_system, Lisp_Object terminal)
9762 struct terminal *term = get_terminal (terminal, 1);
9763 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9764 CHECK_SYMBOL (coding_system);
9765 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9766 /* We had better not send unsafe characters to terminal. */
9767 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9768 /* Character composition should be disabled. */
9769 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9770 terminal_coding->src_multibyte = 1;
9771 terminal_coding->dst_multibyte = 0;
9772 tset_charset_list
9773 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9774 ? coding_charset_list (terminal_coding)
9775 : list1 (make_number (charset_ascii))));
9776 return Qnil;
9779 DEFUN ("set-safe-terminal-coding-system-internal",
9780 Fset_safe_terminal_coding_system_internal,
9781 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9782 doc: /* Internal use only. */)
9783 (Lisp_Object coding_system)
9785 CHECK_SYMBOL (coding_system);
9786 setup_coding_system (Fcheck_coding_system (coding_system),
9787 &safe_terminal_coding);
9788 /* Character composition should be disabled. */
9789 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9790 safe_terminal_coding.src_multibyte = 1;
9791 safe_terminal_coding.dst_multibyte = 0;
9792 return Qnil;
9795 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9796 Sterminal_coding_system, 0, 1, 0,
9797 doc: /* Return coding system specified for terminal output on the given terminal.
9798 TERMINAL may be a terminal object, a frame, or nil for the selected
9799 frame's terminal device. */)
9800 (Lisp_Object terminal)
9802 struct coding_system *terminal_coding
9803 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9804 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9806 /* For backward compatibility, return nil if it is `undecided'. */
9807 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9810 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9811 Sset_keyboard_coding_system_internal, 1, 2, 0,
9812 doc: /* Internal use only. */)
9813 (Lisp_Object coding_system, Lisp_Object terminal)
9815 struct terminal *t = get_terminal (terminal, 1);
9816 CHECK_SYMBOL (coding_system);
9817 if (NILP (coding_system))
9818 coding_system = Qno_conversion;
9819 else
9820 Fcheck_coding_system (coding_system);
9821 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9822 /* Character composition should be disabled. */
9823 TERMINAL_KEYBOARD_CODING (t)->common_flags
9824 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9825 return Qnil;
9828 DEFUN ("keyboard-coding-system",
9829 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9830 doc: /* Return coding system specified for decoding keyboard input. */)
9831 (Lisp_Object terminal)
9833 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9834 (get_terminal (terminal, 1))->id);
9838 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9839 Sfind_operation_coding_system, 1, MANY, 0,
9840 doc: /* Choose a coding system for an operation based on the target name.
9841 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9842 DECODING-SYSTEM is the coding system to use for decoding
9843 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9844 for encoding (in case OPERATION does encoding).
9846 The first argument OPERATION specifies an I/O primitive:
9847 For file I/O, `insert-file-contents' or `write-region'.
9848 For process I/O, `call-process', `call-process-region', or `start-process'.
9849 For network I/O, `open-network-stream'.
9851 The remaining arguments should be the same arguments that were passed
9852 to the primitive. Depending on which primitive, one of those arguments
9853 is selected as the TARGET. For example, if OPERATION does file I/O,
9854 whichever argument specifies the file name is TARGET.
9856 TARGET has a meaning which depends on OPERATION:
9857 For file I/O, TARGET is a file name (except for the special case below).
9858 For process I/O, TARGET is a process name.
9859 For network I/O, TARGET is a service name or a port number.
9861 This function looks up what is specified for TARGET in
9862 `file-coding-system-alist', `process-coding-system-alist',
9863 or `network-coding-system-alist' depending on OPERATION.
9864 They may specify a coding system, a cons of coding systems,
9865 or a function symbol to call.
9866 In the last case, we call the function with one argument,
9867 which is a list of all the arguments given to this function.
9868 If the function can't decide a coding system, it can return
9869 `undecided' so that the normal code-detection is performed.
9871 If OPERATION is `insert-file-contents', the argument corresponding to
9872 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9873 file name to look up, and BUFFER is a buffer that contains the file's
9874 contents (not yet decoded). If `file-coding-system-alist' specifies a
9875 function to call for FILENAME, that function should examine the
9876 contents of BUFFER instead of reading the file.
9878 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9879 (ptrdiff_t nargs, Lisp_Object *args)
9881 Lisp_Object operation, target_idx, target, val;
9882 register Lisp_Object chain;
9884 if (nargs < 2)
9885 error ("Too few arguments");
9886 operation = args[0];
9887 if (!SYMBOLP (operation)
9888 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9889 error ("Invalid first argument");
9890 if (nargs <= 1 + XFASTINT (target_idx))
9891 error ("Too few arguments for operation `%s'",
9892 SDATA (SYMBOL_NAME (operation)));
9893 target = args[XFASTINT (target_idx) + 1];
9894 if (!(STRINGP (target)
9895 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9896 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9897 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9898 error ("Invalid argument %"pI"d of operation `%s'",
9899 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9900 if (CONSP (target))
9901 target = XCAR (target);
9903 chain = ((EQ (operation, Qinsert_file_contents)
9904 || EQ (operation, Qwrite_region))
9905 ? Vfile_coding_system_alist
9906 : (EQ (operation, Qopen_network_stream)
9907 ? Vnetwork_coding_system_alist
9908 : Vprocess_coding_system_alist));
9909 if (NILP (chain))
9910 return Qnil;
9912 for (; CONSP (chain); chain = XCDR (chain))
9914 Lisp_Object elt;
9916 elt = XCAR (chain);
9917 if (CONSP (elt)
9918 && ((STRINGP (target)
9919 && STRINGP (XCAR (elt))
9920 && fast_string_match (XCAR (elt), target) >= 0)
9921 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9923 val = XCDR (elt);
9924 /* Here, if VAL is both a valid coding system and a valid
9925 function symbol, we return VAL as a coding system. */
9926 if (CONSP (val))
9927 return val;
9928 if (! SYMBOLP (val))
9929 return Qnil;
9930 if (! NILP (Fcoding_system_p (val)))
9931 return Fcons (val, val);
9932 if (! NILP (Ffboundp (val)))
9934 /* We use call1 rather than safe_call1
9935 so as to get bug reports about functions called here
9936 which don't handle the current interface. */
9937 val = call1 (val, Flist (nargs, args));
9938 if (CONSP (val))
9939 return val;
9940 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9941 return Fcons (val, val);
9943 return Qnil;
9946 return Qnil;
9949 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9950 Sset_coding_system_priority, 0, MANY, 0,
9951 doc: /* Assign higher priority to the coding systems given as arguments.
9952 If multiple coding systems belong to the same category,
9953 all but the first one are ignored.
9955 usage: (set-coding-system-priority &rest coding-systems) */)
9956 (ptrdiff_t nargs, Lisp_Object *args)
9958 ptrdiff_t i, j;
9959 bool changed[coding_category_max];
9960 enum coding_category priorities[coding_category_max];
9962 memset (changed, 0, sizeof changed);
9964 for (i = j = 0; i < nargs; i++)
9966 enum coding_category category;
9967 Lisp_Object spec, attrs;
9969 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9970 attrs = AREF (spec, 0);
9971 category = XINT (CODING_ATTR_CATEGORY (attrs));
9972 if (changed[category])
9973 /* Ignore this coding system because a coding system of the
9974 same category already had a higher priority. */
9975 continue;
9976 changed[category] = 1;
9977 priorities[j++] = category;
9978 if (coding_categories[category].id >= 0
9979 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9980 setup_coding_system (args[i], &coding_categories[category]);
9981 Fset (AREF (Vcoding_category_table, category), args[i]);
9984 /* Now we have decided top J priorities. Reflect the order of the
9985 original priorities to the remaining priorities. */
9987 for (i = j, j = 0; i < coding_category_max; i++, j++)
9989 while (j < coding_category_max
9990 && changed[coding_priorities[j]])
9991 j++;
9992 if (j == coding_category_max)
9993 emacs_abort ();
9994 priorities[i] = coding_priorities[j];
9997 memcpy (coding_priorities, priorities, sizeof priorities);
9999 /* Update `coding-category-list'. */
10000 Vcoding_category_list = Qnil;
10001 for (i = coding_category_max; i-- > 0; )
10002 Vcoding_category_list
10003 = Fcons (AREF (Vcoding_category_table, priorities[i]),
10004 Vcoding_category_list);
10006 return Qnil;
10009 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
10010 Scoding_system_priority_list, 0, 1, 0,
10011 doc: /* Return a list of coding systems ordered by their priorities.
10012 The list contains a subset of coding systems; i.e. coding systems
10013 assigned to each coding category (see `coding-category-list').
10015 HIGHESTP non-nil means just return the highest priority one. */)
10016 (Lisp_Object highestp)
10018 int i;
10019 Lisp_Object val;
10021 for (i = 0, val = Qnil; i < coding_category_max; i++)
10023 enum coding_category category = coding_priorities[i];
10024 int id = coding_categories[category].id;
10025 Lisp_Object attrs;
10027 if (id < 0)
10028 continue;
10029 attrs = CODING_ID_ATTRS (id);
10030 if (! NILP (highestp))
10031 return CODING_ATTR_BASE_NAME (attrs);
10032 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10034 return Fnreverse (val);
10037 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10039 static Lisp_Object
10040 make_subsidiaries (Lisp_Object base)
10042 Lisp_Object subsidiaries;
10043 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10044 char *buf = alloca (base_name_len + 6);
10045 int i;
10047 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10048 subsidiaries = make_uninit_vector (3);
10049 for (i = 0; i < 3; i++)
10051 strcpy (buf + base_name_len, suffixes[i]);
10052 ASET (subsidiaries, i, intern (buf));
10054 return subsidiaries;
/* Lisp primitive that registers a new coding system from the
   positional arguments in ARGS (indexed by the coding_arg_* constants).

   It builds the attribute vector ATTRS, validates and stores the
   arguments specific to the coding type, derives the coding category,
   and finally records the spec vector [ ATTRS ALIASES EOL_TYPE ] under
   NAME in Vcoding_system_hash_table, also adding NAME to
   Vcoding_system_list and Vcoding_system_alist.

   When too few arguments are supplied for the requested type, control
   jumps to `short_args', which signals wrong-number-of-arguments.
   Returns nil on success.  */
10058 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10059 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10060 doc: /* For internal use only.
10061 usage: (define-coding-system-internal ...) */)
10062 (ptrdiff_t nargs, Lisp_Object *args)
10064 Lisp_Object name;
10065 Lisp_Object spec_vec; /* [ ATTRS ALIASES EOL_TYPE ] */
10066 Lisp_Object attrs; /* Vector of attributes. */
10067 Lisp_Object eol_type;
10068 Lisp_Object aliases;
10069 Lisp_Object coding_type, charset_list, safe_charsets;
10070 enum coding_category category;
10071 Lisp_Object tail, val;
10072 int max_charset_id = 0;
10073 int i;
10075 if (nargs < coding_arg_max)
10076 goto short_args;
/* Attributes common to every coding type: base name, mnemonic and
   type symbol.  */
10078 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10080 name = args[coding_arg_name];
10081 CHECK_SYMBOL (name);
10082 ASET (attrs, coding_attr_base_name, name);
/* The mnemonic may be either a string or a character.  */
10084 val = args[coding_arg_mnemonic];
10085 if (! STRINGP (val))
10086 CHECK_CHARACTER (val);
10087 ASET (attrs, coding_attr_mnemonic, val);
10089 coding_type = args[coding_arg_coding_type];
10090 CHECK_SYMBOL (coding_type);
10091 ASET (attrs, coding_attr_type, coding_type);
/* Normalize the charset list to a list of charset IDs and track the
   largest ID seen.  A symbol selects one of the predefined lists
   (only `iso-2022' and `emacs-mule' are substituted, and only when
   they match the coding type); otherwise the list of charsets is
   copied and each element replaced by its ID.  */
10093 charset_list = args[coding_arg_charset_list];
10094 if (SYMBOLP (charset_list))
10096 if (EQ (charset_list, Qiso_2022))
10098 if (! EQ (coding_type, Qiso_2022))
10099 error ("Invalid charset-list");
10100 charset_list = Viso_2022_charset_list;
10102 else if (EQ (charset_list, Qemacs_mule))
10104 if (! EQ (coding_type, Qemacs_mule))
10105 error ("Invalid charset-list");
10106 charset_list = Vemacs_mule_charset_list;
10108 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10110 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10111 error ("Invalid charset-list");
10112 if (max_charset_id < XFASTINT (XCAR (tail)))
10113 max_charset_id = XFASTINT (XCAR (tail));
10116 else
10118 charset_list = Fcopy_sequence (charset_list);
10119 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10121 struct charset *charset;
10123 val = XCAR (tail);
10124 CHECK_CHARSET_GET_CHARSET (val, charset);
/* iso-2022 needs a charset with an ISO final byte, emacs-mule one
   with an emacs-mule ID; other types accept any charset.  */
10125 if (EQ (coding_type, Qiso_2022)
10126 ? CHARSET_ISO_FINAL (charset) < 0
10127 : EQ (coding_type, Qemacs_mule)
10128 ? CHARSET_EMACS_MULE_ID (charset) < 0
10129 : 0)
10130 error ("Can't handle charset `%s'",
10131 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10133 XSETCAR (tail, make_number (charset->id));
10134 if (max_charset_id < charset->id)
10135 max_charset_id = charset->id;
10138 ASET (attrs, coding_attr_charset_list, charset_list);
/* SAFE_CHARSETS is a string indexed by charset ID: byte 0 for each
   charset in CHARSET_LIST, byte 255 for every other ID up to
   MAX_CHARSET_ID.  */
10140 safe_charsets = make_uninit_string (max_charset_id + 1);
10141 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10142 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10143 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10144 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10146 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
/* Translation tables may be a char-table, a cons, or a symbol.  */
10148 val = args[coding_arg_decode_translation_table];
10149 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10150 CHECK_SYMBOL (val);
10151 ASET (attrs, coding_attr_decode_tbl, val);
10153 val = args[coding_arg_encode_translation_table];
10154 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10155 CHECK_SYMBOL (val);
10156 ASET (attrs, coding_attr_encode_tbl, val);
10158 val = args[coding_arg_post_read_conversion];
10159 CHECK_SYMBOL (val);
10160 ASET (attrs, coding_attr_post_read, val);
10162 val = args[coding_arg_pre_write_conversion];
10163 CHECK_SYMBOL (val);
10164 ASET (attrs, coding_attr_pre_write, val);
/* A nil default character is replaced by a space.  */
10166 val = args[coding_arg_default_char];
10167 if (NILP (val))
10168 ASET (attrs, coding_attr_default_char, make_number (' '));
10169 else
10171 CHECK_CHARACTER (val);
10172 ASET (attrs, coding_attr_default_char, val);
10175 val = args[coding_arg_for_unibyte];
10176 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10178 val = args[coding_arg_plist];
10179 CHECK_LIST (val);
10180 ASET (attrs, coding_attr_plist, val);
/* Type-specific processing.  Each branch validates its extra
   arguments, stores the matching attributes and picks CATEGORY.  */
10182 if (EQ (coding_type, Qcharset))
10184 /* Generate a lisp vector of 256 elements. Each element is nil,
10185 integer, or a list of charset IDs.
10187 If Nth element is nil, the byte code N is invalid in this
10188 coding system.
10190 If Nth element is a number NUM, N is the first byte of a
10191 charset whose ID is NUM.
10193 If Nth element is a list of charset IDs, N is the first byte
10194 of one of them. The list is sorted by dimensions of the
10195 charsets. A charset of smaller dimension comes first. */
10196 val = Fmake_vector (make_number (256), Qnil);
10198 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10200 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10201 int dim = CHARSET_DIMENSION (charset);
/* IDX selects the code_space entries used as the range of the first
   byte for a charset of this dimension.  */
10202 int idx = (dim - 1) * 4;
10204 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10205 ASET (attrs, coding_attr_ascii_compat, Qt);
10207 for (i = charset->code_space[idx];
10208 i <= charset->code_space[idx + 1]; i++)
10210 Lisp_Object tmp, tmp2;
10211 int dim2;
10213 tmp = AREF (val, i);
10214 if (NILP (tmp))
10215 tmp = XCAR (tail);
10216 else if (NUMBERP (tmp))
/* One charset already uses this byte: turn the entry into a
   two-element list ordered by dimension.  */
10218 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10219 if (dim < dim2)
10220 tmp = list2 (XCAR (tail), tmp);
10221 else
10222 tmp = list2 (tmp, XCAR (tail));
10224 else
/* Already a list: insert before the first charset of larger
   dimension, or append when there is none.  */
10226 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10228 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10229 if (dim < dim2)
10230 break;
10232 if (NILP (tmp2))
10233 tmp = nconc2 (tmp, list1 (XCAR (tail)));
10234 else
10236 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10237 XSETCAR (tmp2, XCAR (tail));
10240 ASET (val, i, tmp);
10243 ASET (attrs, coding_attr_charset_valids, val);
10244 category = coding_category_charset;
10246 else if (EQ (coding_type, Qccl))
10248 Lisp_Object valids;
10250 if (nargs < coding_arg_ccl_max)
10251 goto short_args;
/* CCL programs given as vectors are copied before being stored.  */
10253 val = args[coding_arg_ccl_decoder];
10254 CHECK_CCL_PROGRAM (val);
10255 if (VECTORP (val))
10256 val = Fcopy_sequence (val);
10257 ASET (attrs, coding_attr_ccl_decoder, val);
10259 val = args[coding_arg_ccl_encoder];
10260 CHECK_CCL_PROGRAM (val);
10261 if (VECTORP (val))
10262 val = Fcopy_sequence (val);
10263 ASET (attrs, coding_attr_ccl_encoder, val);
/* VALIDS is a 256-byte string with 1 at every valid byte value.  Each
   element of the argument is either a single byte value or a cons
   (FROM . TO) denoting an inclusive range within 0..255.  */
10265 val = args[coding_arg_ccl_valids];
10266 valids = Fmake_string (make_number (256), make_number (0));
10267 for (tail = val; CONSP (tail); tail = XCDR (tail))
10269 int from, to;
10271 val = XCAR (tail);
10272 if (INTEGERP (val))
10274 if (! (0 <= XINT (val) && XINT (val) <= 255))
10275 args_out_of_range_3 (val, make_number (0), make_number (255));
10276 from = to = XINT (val);
10278 else
10280 CHECK_CONS (val);
10281 CHECK_NATNUM_CAR (val);
10282 CHECK_NUMBER_CDR (val);
10283 if (XINT (XCAR (val)) > 255)
10284 args_out_of_range_3 (XCAR (val),
10285 make_number (0), make_number (255));
10286 from = XINT (XCAR (val));
10287 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10288 args_out_of_range_3 (XCDR (val),
10289 XCAR (val), make_number (255));
10290 to = XINT (XCDR (val));
10292 for (i = from; i <= to; i++)
10293 SSET (valids, i, 1);
10295 ASET (attrs, coding_attr_ccl_valids, valids);
10297 category = coding_category_ccl;
10299 else if (EQ (coding_type, Qutf_16))
10301 Lisp_Object bom, endian;
10303 ASET (attrs, coding_attr_ascii_compat, Qnil);
10305 if (nargs < coding_arg_utf16_max)
10306 goto short_args;
/* BOM is nil, t, or a cons of two coding systems.  */
10308 bom = args[coding_arg_utf16_bom];
10309 if (! NILP (bom) && ! EQ (bom, Qt))
10311 CHECK_CONS (bom);
10312 val = XCAR (bom);
10313 CHECK_CODING_SYSTEM (val);
10314 val = XCDR (bom);
10315 CHECK_CODING_SYSTEM (val);
10317 ASET (attrs, coding_attr_utf_bom, bom);
/* A nil endian defaults to `big'.  */
10319 endian = args[coding_arg_utf16_endian];
10320 CHECK_SYMBOL (endian);
10321 if (NILP (endian))
10322 endian = Qbig;
10323 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10324 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10325 ASET (attrs, coding_attr_utf_16_endian, endian);
/* Cons BOM -> auto; nil BOM -> the "nosig" category of the endian;
   otherwise (t) -> the plain category of the endian.  */
10327 category = (CONSP (bom)
10328 ? coding_category_utf_16_auto
10329 : NILP (bom)
10330 ? (EQ (endian, Qbig)
10331 ? coding_category_utf_16_be_nosig
10332 : coding_category_utf_16_le_nosig)
10333 : (EQ (endian, Qbig)
10334 ? coding_category_utf_16_be
10335 : coding_category_utf_16_le));
10337 else if (EQ (coding_type, Qiso_2022))
10339 Lisp_Object initial, reg_usage, request, flags;
10341 if (nargs < coding_arg_iso2022_max)
10342 goto short_args;
/* INITIAL holds the charsets for the first four elements of the
   vector; each is replaced by its charset ID, or -1 when nil.
   NOTE(review): the vector length is not checked before elements
   0..3 are read -- confirm callers always pass at least 4 elements.  */
10344 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10345 CHECK_VECTOR (initial);
10346 for (i = 0; i < 4; i++)
10348 val = AREF (initial, i);
10349 if (! NILP (val))
10351 struct charset *charset;
10353 CHECK_CHARSET_GET_CHARSET (val, charset);
10354 ASET (initial, i, make_number (CHARSET_ID (charset)));
10355 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10356 ASET (attrs, coding_attr_ascii_compat, Qt);
10358 else
10359 ASET (initial, i, make_number (-1));
10362 reg_usage = args[coding_arg_iso2022_reg_usage];
10363 CHECK_CONS (reg_usage);
10364 CHECK_NUMBER_CAR (reg_usage);
10365 CHECK_NUMBER_CDR (reg_usage);
/* REQUEST is a list of (CHARSET . REGISTER) with REGISTER in 0..3;
   each CHARSET is replaced in place by its ID.  */
10367 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10368 for (tail = request; CONSP (tail); tail = XCDR (tail))
10370 int id;
10371 Lisp_Object tmp1;
10373 val = XCAR (tail);
10374 CHECK_CONS (val);
10375 tmp1 = XCAR (val);
10376 CHECK_CHARSET_GET_ID (tmp1, id);
10377 CHECK_NATNUM_CDR (val);
10378 if (XINT (XCDR (val)) >= 4)
10379 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10380 XSETCAR (val, make_number (id));
/* From here on I holds the flag bits; the full-support bit is added
   when the charset list argument was the symbol `iso-2022'.  */
10383 flags = args[coding_arg_iso2022_flags];
10384 CHECK_NATNUM (flags);
10385 i = XINT (flags) & INT_MAX;
10386 if (EQ (args[coding_arg_charset_list], Qiso_2022))
10387 i |= CODING_ISO_FLAG_FULL_SUPPORT;
10388 flags = make_number (i);
10390 ASET (attrs, coding_attr_iso_initial, initial);
10391 ASET (attrs, coding_attr_iso_usage, reg_usage);
10392 ASET (attrs, coding_attr_iso_request, request);
10393 ASET (attrs, coding_attr_iso_flags, flags);
10394 setup_iso_safe_charsets (attrs);
/* Pick the ISO category from the 7-bit and shift flags, whether the
   full charset list was requested, and the dimension of the charset
   in INITIAL[1].  */
10396 if (i & CODING_ISO_FLAG_SEVEN_BITS)
10397 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10398 | CODING_ISO_FLAG_SINGLE_SHIFT))
10399 ? coding_category_iso_7_else
10400 : EQ (args[coding_arg_charset_list], Qiso_2022)
10401 ? coding_category_iso_7
10402 : coding_category_iso_7_tight);
10403 else
10405 int id = XINT (AREF (initial, 1));
10407 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10408 || EQ (args[coding_arg_charset_list], Qiso_2022)
10409 || id < 0)
10410 ? coding_category_iso_8_else
10411 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10412 ? coding_category_iso_8_1
10413 : coding_category_iso_8_2);
/* Only the iso-8-1 and iso-8-2 categories keep the ASCII-compatible
   attribute.  */
10415 if (category != coding_category_iso_8_1
10416 && category != coding_category_iso_8_2)
10417 ASET (attrs, coding_attr_ascii_compat, Qnil);
10419 else if (EQ (coding_type, Qemacs_mule))
10421 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10422 ASET (attrs, coding_attr_emacs_mule_full, Qt);
10423 ASET (attrs, coding_attr_ascii_compat, Qt);
10424 category = coding_category_emacs_mule;
10426 else if (EQ (coding_type, Qshift_jis))
10429 struct charset *charset;
/* Expect three or four charsets with dimensions 1, 1, 2 and
   optionally 2, in that order.  */
10431 if (XINT (Flength (charset_list)) != 3
10432 && XINT (Flength (charset_list)) != 4)
10433 error ("There should be three or four charsets");
10435 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10436 if (CHARSET_DIMENSION (charset) != 1)
10437 error ("Dimension of charset %s is not one",
10438 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10439 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10440 ASET (attrs, coding_attr_ascii_compat, Qt);
10442 charset_list = XCDR (charset_list);
10443 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10444 if (CHARSET_DIMENSION (charset) != 1)
10445 error ("Dimension of charset %s is not one",
10446 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10448 charset_list = XCDR (charset_list);
10449 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10450 if (CHARSET_DIMENSION (charset) != 2)
10451 error ("Dimension of charset %s is not two",
10452 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10454 charset_list = XCDR (charset_list);
10455 if (! NILP (charset_list))
10457 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10458 if (CHARSET_DIMENSION (charset) != 2)
10459 error ("Dimension of charset %s is not two",
10460 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10463 category = coding_category_sjis;
10464 Vsjis_coding_system = name;
10466 else if (EQ (coding_type, Qbig5))
10468 struct charset *charset;
/* Expect exactly two charsets, of dimension 1 and 2.  */
10470 if (XINT (Flength (charset_list)) != 2)
10471 error ("There should be just two charsets");
10473 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10474 if (CHARSET_DIMENSION (charset) != 1)
10475 error ("Dimension of charset %s is not one",
10476 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10477 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10478 ASET (attrs, coding_attr_ascii_compat, Qt);
10480 charset_list = XCDR (charset_list);
10481 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10482 if (CHARSET_DIMENSION (charset) != 2)
10483 error ("Dimension of charset %s is not two",
10484 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10486 category = coding_category_big5;
10487 Vbig5_coding_system = name;
10489 else if (EQ (coding_type, Qraw_text))
10491 category = coding_category_raw_text;
10492 ASET (attrs, coding_attr_ascii_compat, Qt);
10494 else if (EQ (coding_type, Qutf_8))
10496 Lisp_Object bom;
10498 if (nargs < coding_arg_utf8_max)
10499 goto short_args;
/* BOM is nil, t, or a cons of two coding systems.  Only the nil
   case is marked ASCII compatible here.  */
10501 bom = args[coding_arg_utf8_bom];
10502 if (! NILP (bom) && ! EQ (bom, Qt))
10504 CHECK_CONS (bom);
10505 val = XCAR (bom);
10506 CHECK_CODING_SYSTEM (val);
10507 val = XCDR (bom);
10508 CHECK_CODING_SYSTEM (val);
10510 ASET (attrs, coding_attr_utf_bom, bom);
10511 if (NILP (bom))
10512 ASET (attrs, coding_attr_ascii_compat, Qt);
10514 category = (CONSP (bom) ? coding_category_utf_8_auto
10515 : NILP (bom) ? coding_category_utf_8_nosig
10516 : coding_category_utf_8_sig);
10518 else if (EQ (coding_type, Qundecided))
10520 if (nargs < coding_arg_undecided_max)
10521 goto short_args;
10522 ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10523 args[coding_arg_undecided_inhibit_null_byte_detection]);
10524 ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10525 args[coding_arg_undecided_inhibit_iso_escape_detection]);
10526 ASET (attrs, coding_attr_undecided_prefer_utf_8,
10527 args[coding_arg_undecided_prefer_utf_8]);
10528 category = coding_category_undecided;
10530 else
10531 error ("Invalid coding system type: %s",
10532 SDATA (SYMBOL_NAME (coding_type)));
/* Record the category, and prepend :category and
   :ascii-compatible-p entries to the property list.  */
10534 ASET (attrs, coding_attr_category, make_number (category));
10535 ASET (attrs, coding_attr_plist,
10536 Fcons (QCcategory,
10537 Fcons (AREF (Vcoding_category_table, category),
10538 CODING_ATTR_PLIST (attrs))));
10539 ASET (attrs, coding_attr_plist,
10540 Fcons (QCascii_compatible_p,
10541 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10542 CODING_ATTR_PLIST (attrs))));
10544 eol_type = args[coding_arg_eol_type];
10545 if (! NILP (eol_type)
10546 && ! EQ (eol_type, Qunix)
10547 && ! EQ (eol_type, Qdos)
10548 && ! EQ (eol_type, Qmac))
10549 error ("Invalid eol-type");
10551 aliases = list1 (name);
/* A nil eol-type means the three subsidiaries (NAME-unix, NAME-dos,
   NAME-mac) are defined as well; they share ATTRS with the base, and
   EOL_TYPE of the base becomes the vector of their names.  */
10553 if (NILP (eol_type))
10555 eol_type = make_subsidiaries (name);
10556 for (i = 0; i < 3; i++)
10558 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10560 this_name = AREF (eol_type, i);
10561 this_aliases = list1 (this_name);
10562 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10563 this_spec = make_uninit_vector (3);
10564 ASET (this_spec, 0, attrs);
10565 ASET (this_spec, 1, this_aliases);
10566 ASET (this_spec, 2, this_eol_type);
10567 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10568 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10569 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10570 if (NILP (val))
10571 Vcoding_system_alist
10572 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10573 Vcoding_system_alist);
/* Register the base coding system itself.  */
10577 spec_vec = make_uninit_vector (3);
10578 ASET (spec_vec, 0, attrs);
10579 ASET (spec_vec, 1, aliases);
10580 ASET (spec_vec, 2, eol_type);
10582 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10583 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10584 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10585 if (NILP (val))
10586 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10587 Vcoding_system_alist);
/* Make NAME the representative of its category when the category has
   none yet, or when NAME already is the representative (so that a
   redefinition is picked up).  */
10590 int id = coding_categories[category].id;
10592 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10593 setup_coding_system (name, &coding_categories[category]);
10596 return Qnil;
/* Reached when NARGS is smaller than the coding type requires.  */
10598 short_args:
10599 return Fsignal (Qwrong_number_of_arguments,
10600 Fcons (intern ("define-coding-system-internal"),
10601 make_number (nargs)));
10605 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10606 3, 3, 0,
10607 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10608 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10610 Lisp_Object spec, attrs;
10612 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10613 attrs = AREF (spec, 0);
10614 if (EQ (prop, QCmnemonic))
10616 if (! STRINGP (val))
10617 CHECK_CHARACTER (val);
10618 ASET (attrs, coding_attr_mnemonic, val);
10620 else if (EQ (prop, QCdefault_char))
10622 if (NILP (val))
10623 val = make_number (' ');
10624 else
10625 CHECK_CHARACTER (val);
10626 ASET (attrs, coding_attr_default_char, val);
10628 else if (EQ (prop, QCdecode_translation_table))
10630 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10631 CHECK_SYMBOL (val);
10632 ASET (attrs, coding_attr_decode_tbl, val);
10634 else if (EQ (prop, QCencode_translation_table))
10636 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10637 CHECK_SYMBOL (val);
10638 ASET (attrs, coding_attr_encode_tbl, val);
10640 else if (EQ (prop, QCpost_read_conversion))
10642 CHECK_SYMBOL (val);
10643 ASET (attrs, coding_attr_post_read, val);
10645 else if (EQ (prop, QCpre_write_conversion))
10647 CHECK_SYMBOL (val);
10648 ASET (attrs, coding_attr_pre_write, val);
10650 else if (EQ (prop, QCascii_compatible_p))
10652 ASET (attrs, coding_attr_ascii_compat, val);
10655 ASET (attrs, coding_attr_plist,
10656 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10657 return val;
10661 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10662 Sdefine_coding_system_alias, 2, 2, 0,
10663 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10664 (Lisp_Object alias, Lisp_Object coding_system)
10666 Lisp_Object spec, aliases, eol_type, val;
10668 CHECK_SYMBOL (alias);
10669 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10670 aliases = AREF (spec, 1);
10671 /* ALIASES should be a list of length more than zero, and the first
10672 element is a base coding system. Append ALIAS at the tail of the
10673 list. */
10674 while (!NILP (XCDR (aliases)))
10675 aliases = XCDR (aliases);
10676 XSETCDR (aliases, list1 (alias));
10678 eol_type = AREF (spec, 2);
10679 if (VECTORP (eol_type))
10681 Lisp_Object subsidiaries;
10682 int i;
10684 subsidiaries = make_subsidiaries (alias);
10685 for (i = 0; i < 3; i++)
10686 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10687 AREF (eol_type, i));
10690 Fputhash (alias, spec, Vcoding_system_hash_table);
10691 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10692 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10693 if (NILP (val))
10694 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10695 Vcoding_system_alist);
10697 return Qnil;
10700 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10701 1, 1, 0,
10702 doc: /* Return the base of CODING-SYSTEM.
10703 Any alias or subsidiary coding system is not a base coding system. */)
10704 (Lisp_Object coding_system)
10706 Lisp_Object spec, attrs;
10708 if (NILP (coding_system))
10709 return (Qno_conversion);
10710 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10711 attrs = AREF (spec, 0);
10712 return CODING_ATTR_BASE_NAME (attrs);
10715 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10716 1, 1, 0,
10717 doc: "Return the property list of CODING-SYSTEM.")
10718 (Lisp_Object coding_system)
10720 Lisp_Object spec, attrs;
10722 if (NILP (coding_system))
10723 coding_system = Qno_conversion;
10724 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10725 attrs = AREF (spec, 0);
10726 return CODING_ATTR_PLIST (attrs);
10730 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10731 1, 1, 0,
10732 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10733 (Lisp_Object coding_system)
10735 Lisp_Object spec;
10737 if (NILP (coding_system))
10738 coding_system = Qno_conversion;
10739 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10740 return AREF (spec, 1);
10743 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10744 Scoding_system_eol_type, 1, 1, 0,
10745 doc: /* Return eol-type of CODING-SYSTEM.
10746 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10748 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10749 and CR respectively.
10751 A vector value indicates that a format of end-of-line should be
10752 detected automatically. Nth element of the vector is the subsidiary
10753 coding system whose eol-type is N. */)
10754 (Lisp_Object coding_system)
10756 Lisp_Object spec, eol_type;
10757 int n;
10759 if (NILP (coding_system))
10760 coding_system = Qno_conversion;
10761 if (! CODING_SYSTEM_P (coding_system))
10762 return Qnil;
10763 spec = CODING_SYSTEM_SPEC (coding_system);
10764 eol_type = AREF (spec, 2);
10765 if (VECTORP (eol_type))
10766 return Fcopy_sequence (eol_type);
10767 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10768 return make_number (n);
10771 #endif /* emacs */
10774 /*** 9. Post-amble ***/
10776 void
10777 init_coding_once (void)
10779 int i;
10781 for (i = 0; i < coding_category_max; i++)
10783 coding_categories[i].id = -1;
10784 coding_priorities[i] = i;
10787 /* ISO2022 specific initialize routine. */
10788 for (i = 0; i < 0x20; i++)
10789 iso_code_class[i] = ISO_control_0;
10790 for (i = 0x21; i < 0x7F; i++)
10791 iso_code_class[i] = ISO_graphic_plane_0;
10792 for (i = 0x80; i < 0xA0; i++)
10793 iso_code_class[i] = ISO_control_1;
10794 for (i = 0xA1; i < 0xFF; i++)
10795 iso_code_class[i] = ISO_graphic_plane_1;
10796 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10797 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10798 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10799 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10800 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10801 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10802 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10803 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10804 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10806 for (i = 0; i < 256; i++)
10808 emacs_mule_bytes[i] = 1;
10810 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10811 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10812 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10813 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10816 #ifdef emacs
10818 void
10819 syms_of_coding (void)
10821 staticpro (&Vcoding_system_hash_table);
10823 Lisp_Object args[2];
10824 args[0] = QCtest;
10825 args[1] = Qeq;
10826 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10829 staticpro (&Vsjis_coding_system);
10830 Vsjis_coding_system = Qnil;
10832 staticpro (&Vbig5_coding_system);
10833 Vbig5_coding_system = Qnil;
10835 staticpro (&Vcode_conversion_reused_workbuf);
10836 Vcode_conversion_reused_workbuf = Qnil;
10838 staticpro (&Vcode_conversion_workbuf_name);
10839 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10841 reused_workbuf_in_use = 0;
10843 DEFSYM (Qcharset, "charset");
10844 DEFSYM (Qtarget_idx, "target-idx");
10845 DEFSYM (Qcoding_system_history, "coding-system-history");
10846 Fset (Qcoding_system_history, Qnil);
10848 /* Target FILENAME is the first argument. */
10849 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10850 /* Target FILENAME is the third argument. */
10851 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10853 DEFSYM (Qcall_process, "call-process");
10854 /* Target PROGRAM is the first argument. */
10855 Fput (Qcall_process, Qtarget_idx, make_number (0));
10857 DEFSYM (Qcall_process_region, "call-process-region");
10858 /* Target PROGRAM is the third argument. */
10859 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10861 DEFSYM (Qstart_process, "start-process");
10862 /* Target PROGRAM is the third argument. */
10863 Fput (Qstart_process, Qtarget_idx, make_number (2));
10865 DEFSYM (Qopen_network_stream, "open-network-stream");
10866 /* Target SERVICE is the fourth argument. */
10867 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10869 DEFSYM (Qcoding_system, "coding-system");
10870 DEFSYM (Qcoding_aliases, "coding-aliases");
10872 DEFSYM (Qeol_type, "eol-type");
10873 DEFSYM (Qunix, "unix");
10874 DEFSYM (Qdos, "dos");
10875 DEFSYM (Qmac, "mac");
10877 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10878 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10879 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10880 DEFSYM (Qdefault_char, "default-char");
10881 DEFSYM (Qundecided, "undecided");
10882 DEFSYM (Qno_conversion, "no-conversion");
10883 DEFSYM (Qraw_text, "raw-text");
10885 DEFSYM (Qiso_2022, "iso-2022");
10887 DEFSYM (Qutf_8, "utf-8");
10888 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10890 #if defined (WINDOWSNT) || defined (CYGWIN)
10891 /* No, not utf-16-le: that one has a BOM. */
10892 DEFSYM (Qutf_16le, "utf-16le");
10893 #endif
10895 DEFSYM (Qutf_16, "utf-16");
10896 DEFSYM (Qbig, "big");
10897 DEFSYM (Qlittle, "little");
10899 DEFSYM (Qshift_jis, "shift-jis");
10900 DEFSYM (Qbig5, "big5");
10902 DEFSYM (Qcoding_system_p, "coding-system-p");
10904 DEFSYM (Qcoding_system_error, "coding-system-error");
10905 Fput (Qcoding_system_error, Qerror_conditions,
10906 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10907 Fput (Qcoding_system_error, Qerror_message,
10908 build_pure_c_string ("Invalid coding system"));
10910 DEFSYM (Qtranslation_table, "translation-table");
10911 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10912 DEFSYM (Qtranslation_table_id, "translation-table-id");
10913 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10914 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10916 DEFSYM (Qvalid_codes, "valid-codes");
10918 DEFSYM (Qemacs_mule, "emacs-mule");
10920 DEFSYM (QCcategory, ":category");
10921 DEFSYM (QCmnemonic, ":mnemonic");
10922 DEFSYM (QCdefault_char, ":default-char");
10923 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10924 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10925 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10926 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10927 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10929 Vcoding_category_table
10930 = Fmake_vector (make_number (coding_category_max), Qnil);
10931 staticpro (&Vcoding_category_table);
10932 /* Followings are target of code detection. */
10933 ASET (Vcoding_category_table, coding_category_iso_7,
10934 intern_c_string ("coding-category-iso-7"));
10935 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10936 intern_c_string ("coding-category-iso-7-tight"));
10937 ASET (Vcoding_category_table, coding_category_iso_8_1,
10938 intern_c_string ("coding-category-iso-8-1"));
10939 ASET (Vcoding_category_table, coding_category_iso_8_2,
10940 intern_c_string ("coding-category-iso-8-2"));
10941 ASET (Vcoding_category_table, coding_category_iso_7_else,
10942 intern_c_string ("coding-category-iso-7-else"));
10943 ASET (Vcoding_category_table, coding_category_iso_8_else,
10944 intern_c_string ("coding-category-iso-8-else"));
10945 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10946 intern_c_string ("coding-category-utf-8-auto"));
10947 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10948 intern_c_string ("coding-category-utf-8"));
10949 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10950 intern_c_string ("coding-category-utf-8-sig"));
10951 ASET (Vcoding_category_table, coding_category_utf_16_be,
10952 intern_c_string ("coding-category-utf-16-be"));
10953 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10954 intern_c_string ("coding-category-utf-16-auto"));
10955 ASET (Vcoding_category_table, coding_category_utf_16_le,
10956 intern_c_string ("coding-category-utf-16-le"));
10957 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10958 intern_c_string ("coding-category-utf-16-be-nosig"));
10959 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10960 intern_c_string ("coding-category-utf-16-le-nosig"));
10961 ASET (Vcoding_category_table, coding_category_charset,
10962 intern_c_string ("coding-category-charset"));
10963 ASET (Vcoding_category_table, coding_category_sjis,
10964 intern_c_string ("coding-category-sjis"));
10965 ASET (Vcoding_category_table, coding_category_big5,
10966 intern_c_string ("coding-category-big5"));
10967 ASET (Vcoding_category_table, coding_category_ccl,
10968 intern_c_string ("coding-category-ccl"));
10969 ASET (Vcoding_category_table, coding_category_emacs_mule,
10970 intern_c_string ("coding-category-emacs-mule"));
10971 /* The following categories are NOT targets of code detection. */
10972 ASET (Vcoding_category_table, coding_category_raw_text,
10973 intern_c_string ("coding-category-raw-text"));
10974 ASET (Vcoding_category_table, coding_category_undecided,
10975 intern_c_string ("coding-category-undecided"));
10977 DEFSYM (Qinsufficient_source, "insufficient-source");
10978 DEFSYM (Qinvalid_source, "invalid-source");
10979 DEFSYM (Qinterrupted, "interrupted");
10980 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10982 defsubr (&Scoding_system_p);
10983 defsubr (&Sread_coding_system);
10984 defsubr (&Sread_non_nil_coding_system);
10985 defsubr (&Scheck_coding_system);
10986 defsubr (&Sdetect_coding_region);
10987 defsubr (&Sdetect_coding_string);
10988 defsubr (&Sfind_coding_systems_region_internal);
10989 defsubr (&Sunencodable_char_position);
10990 defsubr (&Scheck_coding_systems_region);
10991 defsubr (&Sdecode_coding_region);
10992 defsubr (&Sencode_coding_region);
10993 defsubr (&Sdecode_coding_string);
10994 defsubr (&Sencode_coding_string);
10995 defsubr (&Sdecode_sjis_char);
10996 defsubr (&Sencode_sjis_char);
10997 defsubr (&Sdecode_big5_char);
10998 defsubr (&Sencode_big5_char);
10999 defsubr (&Sset_terminal_coding_system_internal);
11000 defsubr (&Sset_safe_terminal_coding_system_internal);
11001 defsubr (&Sterminal_coding_system);
11002 defsubr (&Sset_keyboard_coding_system_internal);
11003 defsubr (&Skeyboard_coding_system);
11004 defsubr (&Sfind_operation_coding_system);
11005 defsubr (&Sset_coding_system_priority);
11006 defsubr (&Sdefine_coding_system_internal);
11007 defsubr (&Sdefine_coding_system_alias);
11008 defsubr (&Scoding_system_put);
11009 defsubr (&Scoding_system_base);
11010 defsubr (&Scoding_system_plist);
11011 defsubr (&Scoding_system_aliases);
11012 defsubr (&Scoding_system_eol_type);
11013 defsubr (&Scoding_system_priority_list);
11015 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
11016 doc: /* List of coding systems.
11018 Do not alter the value of this variable manually. This variable should be
11019 updated by the functions `define-coding-system' and
11020 `define-coding-system-alias'. */);
11021 Vcoding_system_list = Qnil;
11023 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11024 doc: /* Alist of coding system names.
11025 Each element is one element list of coding system name.
11026 This variable is given to `completing-read' as COLLECTION argument.
11028 Do not alter the value of this variable manually. This variable should be
11029 updated by the functions `make-coding-system' and
11030 `define-coding-system-alias'. */);
11031 Vcoding_system_alist = Qnil;
11033 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11034 doc: /* List of coding-categories (symbols) ordered by priority.
11036 On detecting a coding system, Emacs tries code detection algorithms
11037 associated with each coding-category one by one in this order. When
11038 one algorithm agrees with a byte sequence of source text, the coding
11039 system bound to the corresponding coding-category is selected.
11041 Don't modify this variable directly, but use `set-coding-system-priority'. */);
11043 int i;
11045 Vcoding_category_list = Qnil;
11046 for (i = coding_category_max - 1; i >= 0; i--)
11047 Vcoding_category_list
11048 = Fcons (AREF (Vcoding_category_table, i),
11049 Vcoding_category_list);
11052 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11053 doc: /* Specify the coding system for read operations.
11054 It is useful to bind this variable with `let', but do not set it globally.
11055 If the value is a coding system, it is used for decoding on read operation.
11056 If not, an appropriate element is used from one of the coding system alists.
11057 There are three such tables: `file-coding-system-alist',
11058 `process-coding-system-alist', and `network-coding-system-alist'. */);
11059 Vcoding_system_for_read = Qnil;
11061 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11062 doc: /* Specify the coding system for write operations.
11063 Programs bind this variable with `let', but you should not set it globally.
11064 If the value is a coding system, it is used for encoding of output,
11065 when writing it to a file and when sending it to a file or subprocess.
11067 If this does not specify a coding system, an appropriate element
11068 is used from one of the coding system alists.
11069 There are three such tables: `file-coding-system-alist',
11070 `process-coding-system-alist', and `network-coding-system-alist'.
11071 For output to files, if the above procedure does not specify a coding system,
11072 the value of `buffer-file-coding-system' is used. */);
11073 Vcoding_system_for_write = Qnil;
11075 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11076 doc: /*
11077 Coding system used in the latest file or process I/O. */);
11078 Vlast_coding_system_used = Qnil;
11080 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11081 doc: /*
11082 Error status of the last code conversion.
11084 When an error was detected in the last code conversion, this variable
11085 is set to one of the following symbols.
11086 `insufficient-source'
11087 `inconsistent-eol'
11088 `invalid-source'
11089 `interrupted'
11090 `insufficient-memory'
11091 When no error was detected, the value doesn't change. So, to check
11092 the error status of a code conversion by this variable, you must
11093 explicitly set this variable to nil before performing code
11094 conversion. */);
11095 Vlast_code_conversion_error = Qnil;
11097 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11098 doc: /*
11099 *Non-nil means always inhibit code conversion of end-of-line format.
11100 See info node `Coding Systems' and info node `Text and Binary' concerning
11101 such conversion. */);
11102 inhibit_eol_conversion = 0;
11104 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11105 doc: /*
11106 Non-nil means process buffer inherits coding system of process output.
11107 Bind it to t if the process output is to be treated as if it were a file
11108 read from some filesystem. */);
11109 inherit_process_coding_system = 0;
11111 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11112 doc: /*
11113 Alist to decide a coding system to use for a file I/O operation.
11114 The format is ((PATTERN . VAL) ...),
11115 where PATTERN is a regular expression matching a file name,
11116 VAL is a coding system, a cons of coding systems, or a function symbol.
11117 If VAL is a coding system, it is used for both decoding and encoding
11118 the file contents.
11119 If VAL is a cons of coding systems, the car part is used for decoding,
11120 and the cdr part is used for encoding.
11121 If VAL is a function symbol, the function must return a coding system
11122 or a cons of coding systems which are used as above. The function is
11123 called with an argument that is a list of the arguments with which
11124 `find-operation-coding-system' was called. If the function can't decide
11125 a coding system, it can return `undecided' so that the normal
11126 code-detection is performed.
11128 See also the function `find-operation-coding-system'
11129 and the variable `auto-coding-alist'. */);
11130 Vfile_coding_system_alist = Qnil;
11132 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11133 doc: /*
11134 Alist to decide a coding system to use for a process I/O operation.
11135 The format is ((PATTERN . VAL) ...),
11136 where PATTERN is a regular expression matching a program name,
11137 VAL is a coding system, a cons of coding systems, or a function symbol.
11138 If VAL is a coding system, it is used for both decoding what received
11139 from the program and encoding what sent to the program.
11140 If VAL is a cons of coding systems, the car part is used for decoding,
11141 and the cdr part is used for encoding.
11142 If VAL is a function symbol, the function must return a coding system
11143 or a cons of coding systems which are used as above.
11145 See also the function `find-operation-coding-system'. */);
11146 Vprocess_coding_system_alist = Qnil;
11148 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11149 doc: /*
11150 Alist to decide a coding system to use for a network I/O operation.
11151 The format is ((PATTERN . VAL) ...),
11152 where PATTERN is a regular expression matching a network service name
11153 or is a port number to connect to,
11154 VAL is a coding system, a cons of coding systems, or a function symbol.
11155 If VAL is a coding system, it is used for both decoding what received
11156 from the network stream and encoding what sent to the network stream.
11157 If VAL is a cons of coding systems, the car part is used for decoding,
11158 and the cdr part is used for encoding.
11159 If VAL is a function symbol, the function must return a coding system
11160 or a cons of coding systems which are used as above.
11162 See also the function `find-operation-coding-system'. */);
11163 Vnetwork_coding_system_alist = Qnil;
11165 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11166 doc: /* Coding system to use with system messages.
11167 Also used for decoding keyboard input on X Window system. */);
11168 Vlocale_coding_system = Qnil;
11170 /* The eol mnemonics are reset in startup.el system-dependently. */
11171 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11172 doc: /*
11173 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
11174 eol_mnemonic_unix = build_pure_c_string (":");
11176 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11177 doc: /*
11178 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
11179 eol_mnemonic_dos = build_pure_c_string ("\\");
11181 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11182 doc: /*
11183 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
11184 eol_mnemonic_mac = build_pure_c_string ("/");
11186 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11187 doc: /*
11188 *String displayed in mode line when end-of-line format is not yet determined. */);
11189 eol_mnemonic_undecided = build_pure_c_string (":");
11191 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11192 doc: /*
11193 *Non-nil enables character translation while encoding and decoding. */);
11194 Venable_character_translation = Qt;
11196 DEFVAR_LISP ("standard-translation-table-for-decode",
11197 Vstandard_translation_table_for_decode,
11198 doc: /* Table for translating characters while decoding. */);
11199 Vstandard_translation_table_for_decode = Qnil;
11201 DEFVAR_LISP ("standard-translation-table-for-encode",
11202 Vstandard_translation_table_for_encode,
11203 doc: /* Table for translating characters while encoding. */);
11204 Vstandard_translation_table_for_encode = Qnil;
11206 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11207 doc: /* Alist of charsets vs revision numbers.
11208 While encoding, if a charset (car part of an element) is found,
11209 designate it with the escape sequence identifying revision (cdr part
11210 of the element). */);
11211 Vcharset_revision_table = Qnil;
11213 DEFVAR_LISP ("default-process-coding-system",
11214 Vdefault_process_coding_system,
11215 doc: /* Cons of coding systems used for process I/O by default.
11216 The car part is used for decoding a process output,
11217 the cdr part is used for encoding a text to be sent to a process. */);
11218 Vdefault_process_coding_system = Qnil;
11220 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11221 doc: /*
11222 Table of extra Latin codes in the range 128..159 (inclusive).
11223 This is a vector of length 256.
11224 If Nth element is non-nil, the existence of code N in a file
11225 \(or output of subprocess) doesn't prevent it to be detected as
11226 a coding system of ISO 2022 variant which has a flag
11227 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11228 or reading output of a subprocess.
11229 Only 128th through 159th elements have a meaning. */);
11230 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11232 DEFVAR_LISP ("select-safe-coding-system-function",
11233 Vselect_safe_coding_system_function,
11234 doc: /*
11235 Function to call to select safe coding system for encoding a text.
11237 If set, this function is called to force a user to select a proper
11238 coding system which can encode the text in the case that a default
11239 coding system used in each operation can't encode the text. The
11240 function should take care that the buffer is not modified while
11241 the coding system is being selected.
11243 The default value is `select-safe-coding-system' (which see). */);
11244 Vselect_safe_coding_system_function = Qnil;
11246 DEFVAR_BOOL ("coding-system-require-warning",
11247 coding_system_require_warning,
11248 doc: /* Internal use only.
11249 If non-nil, on writing a file, `select-safe-coding-system-function' is
11250 called even if `coding-system-for-write' is non-nil. The command
11251 `universal-coding-system-argument' binds this variable to t temporarily. */);
11252 coding_system_require_warning = 0;
11255 DEFVAR_BOOL ("inhibit-iso-escape-detection",
11256 inhibit_iso_escape_detection,
11257 doc: /*
11258 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11260 When Emacs reads text, it tries to detect how the text is encoded.
11261 This code detection is sensitive to escape sequences. If Emacs sees
11262 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11263 of the ISO2022 encodings, and decodes text by the corresponding coding
11264 system (e.g. `iso-2022-7bit').
11266 However, there may be a case that you want to read escape sequences in
11267 a file as is. In such a case, you can set this variable to non-nil.
11268 Then the code detection will ignore any escape sequences, and no text is
11269 detected as encoded in some ISO-2022 encoding. The result is that all
11270 escape sequences become visible in a buffer.
11272 The default value is nil, and it is strongly recommended not to change
11273 it. That is because many Emacs Lisp source files that contain
11274 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11275 in Emacs's distribution, and they won't be decoded correctly on
11276 reading if you suppress escape sequence detection.
11278 The other way to read escape sequences in a file without decoding is
11279 to explicitly specify some coding system that doesn't use ISO-2022
11280 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument]. */);
11281 inhibit_iso_escape_detection = 0;
11283 DEFVAR_BOOL ("inhibit-null-byte-detection",
11284 inhibit_null_byte_detection,
11285 doc: /* If non-nil, Emacs ignores null bytes on code detection.
11286 By default, Emacs treats it as binary data, and does not attempt to
11287 decode it. The effect is as if you specified `no-conversion' for
11288 reading that text.
11290 Set this to non-nil when a regular text happens to include null bytes.
11291 Examples are Index nodes of Info files and null-byte delimited output
11292 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
11293 decode text as usual. */);
11294 inhibit_null_byte_detection = 0;
11296 DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11297 doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11298 Internal use only. Remove after the experimental optimizer becomes stable. */);
11299 disable_ascii_optimization = 0;
11301 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11302 doc: /* Char table for translating self-inserting characters.
11303 This is applied to the result of input methods, not their input.
11304 See also `keyboard-translate-table'.
11306 Use of this variable for character code unification was rendered
11307 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11308 internal character representation. */);
11309 Vtranslation_table_for_input = Qnil;
11312 Lisp_Object args[coding_arg_undecided_max];
11313 Lisp_Object plist[16];
11314 int i;
11316 for (i = 0; i < coding_arg_undecided_max; i++)
11317 args[i] = Qnil;
11319 plist[0] = intern_c_string (":name");
11320 plist[1] = args[coding_arg_name] = Qno_conversion;
11321 plist[2] = intern_c_string (":mnemonic");
11322 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11323 plist[4] = intern_c_string (":coding-type");
11324 plist[5] = args[coding_arg_coding_type] = Qraw_text;
11325 plist[6] = intern_c_string (":ascii-compatible-p");
11326 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11327 plist[8] = intern_c_string (":default-char");
11328 plist[9] = args[coding_arg_default_char] = make_number (0);
11329 plist[10] = intern_c_string (":for-unibyte");
11330 plist[11] = args[coding_arg_for_unibyte] = Qt;
11331 plist[12] = intern_c_string (":docstring");
11332 plist[13] = build_pure_c_string ("Do no conversion.\n\
11334 When you visit a file with this coding, the file is read into a\n\
11335 unibyte buffer as is, thus each byte of a file is treated as a\n\
11336 character.");
11337 plist[14] = intern_c_string (":eol-type");
11338 plist[15] = args[coding_arg_eol_type] = Qunix;
11339 args[coding_arg_plist] = Flist (16, plist);
11340 Fdefine_coding_system_internal (coding_arg_max, args);
11342 plist[1] = args[coding_arg_name] = Qundecided;
11343 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11344 plist[5] = args[coding_arg_coding_type] = Qundecided;
11345 /* This is already set.
11346 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11347 plist[8] = intern_c_string (":charset-list");
11348 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11349 plist[11] = args[coding_arg_for_unibyte] = Qnil;
11350 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11351 plist[15] = args[coding_arg_eol_type] = Qnil;
11352 args[coding_arg_plist] = Flist (16, plist);
11353 args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11354 args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11355 Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11358 setup_coding_system (Qno_conversion, &safe_terminal_coding);
11361 int i;
11363 for (i = 0; i < coding_category_max; i++)
11364 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11366 #if defined (DOS_NT)
11367 system_eol_type = Qdos;
11368 #else
11369 system_eol_type = Qunix;
11370 #endif
11371 staticpro (&system_eol_type);
/* Return the system error message text for ERROR_NUMBER.

   The text comes from strerror, called after
   synchronize_system_messages_locale.  When Vlocale_coding_system is
   non-nil, the text is wrapped with build_string and passed to
   code_convert_string_norecord together with that coding system; the
   SSDATA of the resulting Lisp object is returned in place of the raw
   strerror result.

   NOTE(review): the returned pointer is either whatever strerror
   handed back or the data of the Lisp object produced by the
   conversion; presumably callers must use or copy it promptly --
   confirm the lifetime requirements against the callers.  */
11374 char *
11375 emacs_strerror (int error_number)
11377 char *str;
/* Synchronize the system-messages locale before asking the C library
   for the message text.  */
11379 synchronize_system_messages_locale ();
11380 str = strerror (error_number);
/* Run the conversion only when a locale coding system is set;
   otherwise the strerror result is returned unchanged.  */
11382 if (! NILP (Vlocale_coding_system))
11384 Lisp_Object dec = code_convert_string_norecord (build_string (str),
11385 Vlocale_coding_system,
11387 str = SSDATA (dec);
11390 return str;
11393 #endif /* emacs */