Backport Tramp changes from trunk.
[emacs.git] / src / coding.c
blobb0a9f6ef4cb1ed335f0e502aec670980b95683f3
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2014 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26 /*** TABLE OF CONTENTS ***
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
44 /*** 0. General comments ***
47 CODING SYSTEM
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcoding_system_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
67 o UTF-8
69 o UTF-16
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
94 o BIG5
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
102 o CCL
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
109 o Raw-text
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
115 o No-conversion
117 Like raw text, but don't do end-of-line conversion.
120 END-OF-LINE FORMAT
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
132 STRUCT CODING_SYSTEM
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
141 /* COMMON MACROS */
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
150 Return true if the byte sequence conforms to XXX.
152 Below is the template of these functions. */
#if 0
/* Template of a detector.  Scan the source bytes of CODING and update
   DETECT_INFO: on success OR the collected category mask into
   DETECT_INFO->found and return 1; if a byte does not conform to XXX,
   set CATEGORY_MASK_XXX in DETECT_INFO->rejected and return 0.  */
static bool
detect_coding_XXX (struct coding_system *coding,
                   struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  int found = 0;
  ...;

  while (1)
    {
      /* Get one byte from the source.  If the source is exhausted, jump
         to no_more_source:.  */
      ONE_MORE_BYTE (c);

      if (! __C_conforms_to_XXX___ (c))
        break;
      /* Only positive evidence marks the category as found; a byte
         that merely conforms (e.g. plain ASCII) proves nothing.  */
      if (__C_strongly_suggests_XXX__ (c))
        found = CATEGORY_MASK_XXX;
    }
  /* The byte sequence is invalid for XXX.  */
  detect_info->rejected |= CATEGORY_MASK_XXX;
  return 0;

 no_more_source:
  /* The source exhausted successfully.  */
  detect_info->found |= found;
  return 1;
}
#endif
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
200 Below is the template of these functions. */
#if 0
/* Template of a decoder.  Decode the not-yet-consumed part of
   CODING->source into CODING->charbuf, then record how much was
   consumed and produced in CODING.  */
static void
decode_coding_XXXX (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code, or
     when there's no room in CHARBUF for a decoded character.  */
  const unsigned char *src_base;
  /* A buffer to produce decoded characters.  */
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  bool multibytep = coding->src_multibyte;

  while (1)
    {
      src_base = src;
      if (charbuf >= charbuf_end)
        /* No more room to produce a decoded character.  */
        break;
      ONE_MORE_BYTE (c);
      /* Decode it. */
    }

 no_more_source:
  if (src_base < src_end
      && coding->mode & CODING_MODE_LAST_BLOCK)
    /* If the source ends by partial bytes to construct a character,
       treat them as eight-bit raw data.  */
    while (src_base < src_end && charbuf < charbuf_end)
      *charbuf++ = *src_base++;
  /* Remember how many bytes and characters we consumed.  If the
     source is multibyte, the bytes and chars are not identical.  */
  coding->consumed = coding->consumed_char = src_base - coding->source;
  /* Remember how many characters we produced.  */
  coding->charbuf_used = charbuf - coding->charbuf;
}
#endif
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce an encoded text until it
256 reaches the head of not-yet-encoded source text.
258 Below is a template of these functions. */
#if 0
/* Template of an encoder.  Encode the characters held in
   CODING->charbuf into CODING->destination, then record how many
   characters and bytes were produced in CODING.  */
static void
encode_coding_XXX (struct coding_system *coding)
{
  bool multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  /* The end is measured from the start of CODING's character buffer;
     CHARBUF itself is a plain `int *' and has no members.  */
  int *charbuf_end = coding->charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
  ptrdiff_t produced_chars = 0;

  for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
    {
      int c = *charbuf;
      /* Encode C into DST, and increment DST.  */
    }
 label_no_more_destination:
  /* How many chars and bytes we produced.  */
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
}
#endif
284 /*** 1. Preamble ***/
286 #include <config.h>
287 #include <stdio.h>
289 #ifdef HAVE_WCHAR_H
290 #include <wchar.h>
291 #endif /* HAVE_WCHAR_H */
293 #include "lisp.h"
294 #include "character.h"
295 #include "buffer.h"
296 #include "charset.h"
297 #include "ccl.h"
298 #include "composite.h"
299 #include "coding.h"
300 #include "window.h"
301 #include "frame.h"
302 #include "termhooks.h"
304 Lisp_Object Vcoding_system_hash_table;
306 static Lisp_Object Qcoding_system, Qeol_type;
307 static Lisp_Object Qcoding_aliases;
308 Lisp_Object Qunix, Qdos;
309 static Lisp_Object Qmac;
310 Lisp_Object Qbuffer_file_coding_system;
311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
312 static Lisp_Object Qdefault_char;
313 Lisp_Object Qno_conversion, Qundecided;
314 Lisp_Object Qcharset, Qutf_8;
315 static Lisp_Object Qiso_2022;
316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
317 static Lisp_Object Qbig, Qlittle;
318 static Lisp_Object Qcoding_system_history;
319 static Lisp_Object Qvalid_codes;
320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
323 static Lisp_Object QCascii_compatible_p;
325 Lisp_Object Qcall_process, Qcall_process_region;
326 Lisp_Object Qstart_process, Qopen_network_stream;
327 static Lisp_Object Qtarget_idx;
329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
331 /* If a symbol has this property, evaluate the value to define the
332 symbol as a coding system. */
333 static Lisp_Object Qcoding_system_define_form;
335 /* Format of end-of-line decided by system. This is Qunix on
336 Unix and Mac, Qdos on DOS/Windows.
337 This has an effect only for external encoding (i.e. for output to
338 file and process), not for in-buffer or Lisp string encoding. */
339 static Lisp_Object system_eol_type;
341 #ifdef emacs
343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
345 /* Coding system emacs-mule and raw-text are for converting only
346 end-of-line format. */
347 Lisp_Object Qemacs_mule, Qraw_text;
348 Lisp_Object Qutf_8_emacs;
350 #if defined (WINDOWSNT) || defined (CYGWIN)
351 static Lisp_Object Qutf_16le;
352 #endif
354 /* Coding-systems are handed between Emacs Lisp programs and C internal
355 routines by the following three variables. */
356 /* Coding system to be used to encode text for terminal display when
357 terminal coding system is nil. */
358 struct coding_system safe_terminal_coding;
360 #endif /* emacs */
362 Lisp_Object Qtranslation_table;
363 Lisp_Object Qtranslation_table_id;
364 static Lisp_Object Qtranslation_table_for_decode;
365 static Lisp_Object Qtranslation_table_for_encode;
367 /* Two special coding systems. */
368 static Lisp_Object Vsjis_coding_system;
369 static Lisp_Object Vbig5_coding_system;
371 /* ISO2022 section */
373 #define CODING_ISO_INITIAL(coding, reg) \
374 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
375 coding_attr_iso_initial), \
376 reg)))
379 #define CODING_ISO_REQUEST(coding, charset_id) \
380 (((charset_id) <= (coding)->max_charset_id \
381 ? ((coding)->safe_charsets[charset_id] != 255 \
382 ? (coding)->safe_charsets[charset_id] \
383 : -1) \
384 : -1))
387 #define CODING_ISO_FLAGS(coding) \
388 ((coding)->spec.iso_2022.flags)
389 #define CODING_ISO_DESIGNATION(coding, reg) \
390 ((coding)->spec.iso_2022.current_designation[reg])
391 #define CODING_ISO_INVOCATION(coding, plane) \
392 ((coding)->spec.iso_2022.current_invocation[plane])
393 #define CODING_ISO_SINGLE_SHIFTING(coding) \
394 ((coding)->spec.iso_2022.single_shifting)
395 #define CODING_ISO_BOL(coding) \
396 ((coding)->spec.iso_2022.bol)
397 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
398 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
399 #define CODING_ISO_CMP_STATUS(coding) \
400 (&(coding)->spec.iso_2022.cmp_status)
401 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
402 ((coding)->spec.iso_2022.ctext_extended_segment_len)
403 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
404 ((coding)->spec.iso_2022.embedded_utf_8)
406 /* Control characters of ISO2022. */
407 /* code */ /* function */
408 #define ISO_CODE_SO 0x0E /* shift-out */
409 #define ISO_CODE_SI 0x0F /* shift-in */
410 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
411 #define ISO_CODE_ESC 0x1B /* escape */
412 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
413 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
414 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
/* Every 1-byte code of ISO2022 is classified into exactly one of the
   following classes.  */
enum iso_code_class_type
  {
    ISO_control_0,              /* Control codes in the range
                                   0x00..0x1F and 0x7F, except for the
                                   following 4 codes.  */
    ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
    ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
    ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
    ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
    ISO_control_1,              /* Control codes in the range
                                   0x80..0x9F, except for the
                                   following 3 codes.  */
    ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
    ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
    ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
    ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
    ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
    ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
    ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  };
439 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
440 `iso-flags' attribute of an iso2022 coding system. */
442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
443 instead of the correct short-form sequence (e.g. ESC $ A). */
444 #define CODING_ISO_FLAG_LONG_FORM 0x0001
446 /* If set, reset graphic planes and registers at end-of-line to the
447 initial state. */
448 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
450 /* If set, reset graphic planes and registers before any control
451 characters to the initial state. */
452 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
454 /* If set, encode by 7-bit environment. */
455 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
457 /* If set, use locking-shift function. */
458 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
460 /* If set, use single-shift function. Overwrite
461 CODING_ISO_FLAG_LOCKING_SHIFT. */
462 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
464 /* If set, use designation escape sequence. */
465 #define CODING_ISO_FLAG_DESIGNATION 0x0040
467 /* If set, produce revision number sequence. */
468 #define CODING_ISO_FLAG_REVISION 0x0080
470 /* If set, produce ISO6429's direction specifying sequence. */
471 #define CODING_ISO_FLAG_DIRECTION 0x0100
473 /* If set, assume designation states are reset at beginning of line on
474 output. */
475 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
477 /* If set, designation sequence should be placed at beginning of line
478 on output. */
479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
481 /* If set, do not encode unsafe characters on output. */
482 #define CODING_ISO_FLAG_SAFE 0x0800
484 /* If set, extra latin codes (128..159) are accepted as a valid code
485 on input. */
486 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
488 #define CODING_ISO_FLAG_COMPOSITION 0x2000
490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
492 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
494 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
496 #define CODING_ISO_FLAG_LEVEL_4 0x20000
498 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
500 /* A character to be produced on output if encoding of the original
501 character is prohibited by CODING_ISO_FLAG_SAFE. */
502 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
504 /* UTF-8 section */
505 #define CODING_UTF_8_BOM(coding) \
506 ((coding)->spec.utf_8_bom)
508 /* UTF-16 section */
509 #define CODING_UTF_16_BOM(coding) \
510 ((coding)->spec.utf_16.bom)
512 #define CODING_UTF_16_ENDIAN(coding) \
513 ((coding)->spec.utf_16.endian)
515 #define CODING_UTF_16_SURROGATE(coding) \
516 ((coding)->spec.utf_16.surrogate)
519 /* CCL section */
520 #define CODING_CCL_DECODER(coding) \
521 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
522 #define CODING_CCL_ENCODER(coding) \
523 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
524 #define CODING_CCL_VALIDS(coding) \
525 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
/* Index for each coding category in `coding_categories'.  The
   enumerators are also used as bit positions by the CATEGORY_MASK_XXX
   macros, so their order and values must not change.  */

enum coding_category
  {
    /* ISO2022 variants.  */
    coding_category_iso_7,
    coding_category_iso_7_tight,
    coding_category_iso_8_1,
    coding_category_iso_8_2,
    coding_category_iso_7_else,
    coding_category_iso_8_else,
    /* UTF-8 variants.  */
    coding_category_utf_8_auto,
    coding_category_utf_8_nosig,
    coding_category_utf_8_sig,
    /* UTF-16 variants.  */
    coding_category_utf_16_auto,
    coding_category_utf_16_be,
    coding_category_utf_16_le,
    coding_category_utf_16_be_nosig,
    coding_category_utf_16_le_nosig,
    /* The remaining detectable categories.  */
    coding_category_charset,
    coding_category_sjis,
    coding_category_big5,
    coding_category_ccl,
    coding_category_emacs_mule,
    /* All above are targets of code detection.  */
    coding_category_raw_text,
    coding_category_undecided,
    /* Number of categories; used to size the per-category tables.  */
    coding_category_max
  };
556 /* Definitions of flag bits used in detect_coding_XXXX. */
557 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
558 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
559 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
560 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
561 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
562 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
563 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
564 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
565 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
566 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
567 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
568 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
569 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
570 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
571 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
572 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
573 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
574 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
575 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
576 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
578 /* This value is returned if detect_coding_mask () finds nothing other
579 than ASCII characters. */
580 #define CATEGORY_MASK_ANY \
581 (CATEGORY_MASK_ISO_7 \
582 | CATEGORY_MASK_ISO_7_TIGHT \
583 | CATEGORY_MASK_ISO_8_1 \
584 | CATEGORY_MASK_ISO_8_2 \
585 | CATEGORY_MASK_ISO_7_ELSE \
586 | CATEGORY_MASK_ISO_8_ELSE \
587 | CATEGORY_MASK_UTF_8_AUTO \
588 | CATEGORY_MASK_UTF_8_NOSIG \
589 | CATEGORY_MASK_UTF_8_SIG \
590 | CATEGORY_MASK_UTF_16_AUTO \
591 | CATEGORY_MASK_UTF_16_BE \
592 | CATEGORY_MASK_UTF_16_LE \
593 | CATEGORY_MASK_UTF_16_BE_NOSIG \
594 | CATEGORY_MASK_UTF_16_LE_NOSIG \
595 | CATEGORY_MASK_CHARSET \
596 | CATEGORY_MASK_SJIS \
597 | CATEGORY_MASK_BIG5 \
598 | CATEGORY_MASK_CCL \
599 | CATEGORY_MASK_EMACS_MULE)
602 #define CATEGORY_MASK_ISO_7BIT \
603 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
605 #define CATEGORY_MASK_ISO_8BIT \
606 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
608 #define CATEGORY_MASK_ISO_ELSE \
609 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
611 #define CATEGORY_MASK_ISO_ESCAPE \
612 (CATEGORY_MASK_ISO_7 \
613 | CATEGORY_MASK_ISO_7_TIGHT \
614 | CATEGORY_MASK_ISO_7_ELSE \
615 | CATEGORY_MASK_ISO_8_ELSE)
617 #define CATEGORY_MASK_ISO \
618 ( CATEGORY_MASK_ISO_7BIT \
619 | CATEGORY_MASK_ISO_8BIT \
620 | CATEGORY_MASK_ISO_ELSE)
622 #define CATEGORY_MASK_UTF_16 \
623 (CATEGORY_MASK_UTF_16_AUTO \
624 | CATEGORY_MASK_UTF_16_BE \
625 | CATEGORY_MASK_UTF_16_LE \
626 | CATEGORY_MASK_UTF_16_BE_NOSIG \
627 | CATEGORY_MASK_UTF_16_LE_NOSIG)
629 #define CATEGORY_MASK_UTF_8 \
630 (CATEGORY_MASK_UTF_8_AUTO \
631 | CATEGORY_MASK_UTF_8_NOSIG \
632 | CATEGORY_MASK_UTF_8_SIG)
634 /* Table of coding categories (Lisp symbols). This variable is for
635 internal use only. */
636 static Lisp_Object Vcoding_category_table;
638 /* Table of coding-categories ordered by priority. */
639 static enum coding_category coding_priorities[coding_category_max];
641 /* Nth element is a coding context for the coding system bound to the
642 Nth coding category. */
643 static struct coding_system coding_categories[coding_category_max];
645 /*** Commonly used macros and functions ***/
647 #ifndef min
648 #define min(a, b) ((a) < (b) ? (a) : (b))
649 #endif
650 #ifndef max
651 #define max(a, b) ((a) > (b) ? (a) : (b))
652 #endif
654 /* Encode a flag that can be nil, something else, or t as -1, 0, 1. */
656 static int
657 encode_inhibit_flag (Lisp_Object flag)
659 return NILP (flag) ? -1 : EQ (flag, Qt);
/* Decide whether a flag should be treated as set.  ENCODED_FLAG is a
   value in the -1/0/1 encoding: 1 forces yes, -1 forces no, and 0
   defers to the user variable VAR.  */

static bool
inhibit_flag (int encoded_flag, bool var)
{
  /* Adding VAR (0 or 1) only changes the sign test when ENCODED_FLAG
     is 0, which is exactly the "ask the variable" case.  */
  const int combined = encoded_flag + var;

  return combined > 0;
}
671 #define CODING_GET_INFO(coding, attrs, charset_list) \
672 do { \
673 (attrs) = CODING_ID_ATTRS ((coding)->id); \
674 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
675 } while (0)
677 static void
678 CHECK_NATNUM_CAR (Lisp_Object x)
680 Lisp_Object tmp = XCAR (x);
681 CHECK_NATNUM (tmp);
682 XSETCAR (x, tmp);
685 static void
686 CHECK_NATNUM_CDR (Lisp_Object x)
688 Lisp_Object tmp = XCDR (x);
689 CHECK_NATNUM (tmp);
690 XSETCDR (x, tmp);
694 /* Safely get one byte from the source text pointed by SRC which ends
695 at SRC_END, and set C to that byte. If there are not enough bytes
696 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
697 and a multibyte character is found at SRC, set C to the
698 negative value of the character code. The caller should declare
699 and set these variables appropriately in advance:
700 src, src_base, src_end, multibytep, consumed_chars, coding */
702 #define ONE_MORE_BYTE(c) \
703 do { \
704 if (src == src_end) \
706 if (src_base < src) \
707 record_conversion_result \
708 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
709 goto no_more_source; \
711 c = *src++; \
712 if (multibytep && (c & 0x80)) \
714 if ((c & 0xFE) == 0xC0) \
715 c = ((c & 1) << 6) | *src++; \
716 else \
718 src--; \
719 c = - string_char (src, &src, NULL); \
720 record_conversion_result \
721 (coding, CODING_RESULT_INVALID_SRC); \
724 consumed_chars++; \
725 } while (0)
727 /* Safely get two bytes from the source text pointed by SRC which ends
728 at SRC_END, and set C1 and C2 to those bytes while skipping the
729 heading multibyte characters. If there are not enough bytes in the
730 source, it jumps to 'no_more_source'. If MULTIBYTEP and
731 a multibyte character is found for C2, set C2 to -1.
732 The caller should declare and set these
733 variables appropriately in advance:
734 src, src_end, multibytep
735 It is intended that this macro is used in detect_coding_utf_16. */
737 #define TWO_MORE_BYTES(c1, c2) \
738 do { \
739 do { \
740 if (src == src_end) \
741 goto no_more_source; \
742 c1 = *src++; \
743 if (multibytep && (c1 & 0x80)) \
745 if ((c1 & 0xFE) == 0xC0) \
746 c1 = ((c1 & 1) << 6) | *src++; \
747 else \
749 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
750 c1 = -1; \
753 } while (c1 < 0); \
754 if (src == src_end) \
755 goto no_more_source; \
756 c2 = *src++; \
757 if (multibytep && (c2 & 0x80)) \
759 if ((c2 & 0xFE) == 0xC0) \
760 c2 = ((c2 & 1) << 6) | *src++; \
761 else \
762 c2 = -1; \
764 } while (0)
767 /* Store a byte C in the place pointed by DST and increment DST to the
768 next free point, and increment PRODUCED_CHARS. The caller should
769 assure that C is 0..127, and declare and set the variables `dst' and
770 `produced_chars' appropriately in advance. */
774 #define EMIT_ONE_ASCII_BYTE(c) \
775 do { \
776 produced_chars++; \
777 *dst++ = (c); \
778 } while (0)
781 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
783 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
784 do { \
785 produced_chars += 2; \
786 *dst++ = (c1), *dst++ = (c2); \
787 } while (0)
790 /* Store a byte C in the place pointed by DST and increment DST to the
791 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
792 store in an appropriate multibyte form. The caller should
793 declare and set the variables `dst' and `multibytep' appropriately
794 in advance. */
796 #define EMIT_ONE_BYTE(c) \
797 do { \
798 produced_chars++; \
799 if (multibytep) \
801 unsigned ch = (c); \
802 if (ch >= 0x80) \
803 ch = BYTE8_TO_CHAR (ch); \
804 CHAR_STRING_ADVANCE (ch, dst); \
806 else \
807 *dst++ = (c); \
808 } while (0)
811 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
813 #define EMIT_TWO_BYTES(c1, c2) \
814 do { \
815 produced_chars += 2; \
816 if (multibytep) \
818 unsigned ch; \
820 ch = (c1); \
821 if (ch >= 0x80) \
822 ch = BYTE8_TO_CHAR (ch); \
823 CHAR_STRING_ADVANCE (ch, dst); \
824 ch = (c2); \
825 if (ch >= 0x80) \
826 ch = BYTE8_TO_CHAR (ch); \
827 CHAR_STRING_ADVANCE (ch, dst); \
829 else \
831 *dst++ = (c1); \
832 *dst++ = (c2); \
834 } while (0)
837 #define EMIT_THREE_BYTES(c1, c2, c3) \
838 do { \
839 EMIT_ONE_BYTE (c1); \
840 EMIT_TWO_BYTES (c2, c3); \
841 } while (0)
844 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
845 do { \
846 EMIT_TWO_BYTES (c1, c2); \
847 EMIT_TWO_BYTES (c3, c4); \
848 } while (0)
851 static void
852 record_conversion_result (struct coding_system *coding,
853 enum coding_result_code result)
855 coding->result = result;
856 switch (result)
858 case CODING_RESULT_INSUFFICIENT_SRC:
859 Vlast_code_conversion_error = Qinsufficient_source;
860 break;
861 case CODING_RESULT_INVALID_SRC:
862 Vlast_code_conversion_error = Qinvalid_source;
863 break;
864 case CODING_RESULT_INTERRUPT:
865 Vlast_code_conversion_error = Qinterrupted;
866 break;
867 case CODING_RESULT_INSUFFICIENT_DST:
868 /* Don't record this error in Vlast_code_conversion_error
869 because it happens just temporarily and is resolved when the
870 whole conversion is finished. */
871 break;
872 case CODING_RESULT_SUCCESS:
873 break;
874 default:
875 Vlast_code_conversion_error = intern ("Unknown error");
879 /* These wrapper macros are used to preserve validity of pointers into
880 buffer text across calls to decode_char, encode_char, etc, which
881 could cause relocation of buffers if it loads a charset map,
882 because loading a charset map allocates large structures. */
884 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
885 do { \
886 ptrdiff_t offset; \
888 charset_map_loaded = 0; \
889 c = DECODE_CHAR (charset, code); \
890 if (charset_map_loaded \
891 && (offset = coding_change_source (coding))) \
893 src += offset; \
894 src_base += offset; \
895 src_end += offset; \
897 } while (0)
899 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
900 do { \
901 ptrdiff_t offset; \
903 charset_map_loaded = 0; \
904 code = ENCODE_CHAR (charset, c); \
905 if (charset_map_loaded \
906 && (offset = coding_change_destination (coding))) \
908 dst += offset; \
909 dst_end += offset; \
911 } while (0)
913 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
914 do { \
915 ptrdiff_t offset; \
917 charset_map_loaded = 0; \
918 charset = char_charset (c, charset_list, code_return); \
919 if (charset_map_loaded \
920 && (offset = coding_change_destination (coding))) \
922 dst += offset; \
923 dst_end += offset; \
925 } while (0)
927 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
928 do { \
929 ptrdiff_t offset; \
931 charset_map_loaded = 0; \
932 result = CHAR_CHARSET_P (c, charset); \
933 if (charset_map_loaded \
934 && (offset = coding_change_destination (coding))) \
936 dst += offset; \
937 dst_end += offset; \
939 } while (0)
942 /* If there is not more than BYTES bytes of room at dst (i.e. dst +
943 BYTES reaches dst_end), allocate more memory for coding->destination
944 and update dst and dst_end. We don't have to take care of
945 coding->source which will be relocated. It is handled by calling coding_set_source in encode_coding. */
947 #define ASSURE_DESTINATION(bytes) \
948 do { \
949 if (dst + (bytes) >= dst_end) \
951 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
953 dst = alloc_destination (coding, more_bytes, dst); \
954 dst_end = coding->destination + coding->dst_bytes; \
956 } while (0)
959 /* Store multibyte form of the character C in P, and advance P to the
960 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
961 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
962 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
964 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
966 /* Return the character code of character whose multibyte form is at
967 P, and advance P to the end of the multibyte form. This used to be
968 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
969 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
971 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
973 /* Set coding->source from coding->src_object. */
975 static void
976 coding_set_source (struct coding_system *coding)
978 if (BUFFERP (coding->src_object))
980 struct buffer *buf = XBUFFER (coding->src_object);
982 if (coding->src_pos < 0)
983 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
984 else
985 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
987 else if (STRINGP (coding->src_object))
989 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
991 else
993 /* Otherwise, the source is C string and is never relocated
994 automatically. Thus we don't have to update anything. */
999 /* Set coding->source from coding->src_object, and return how many
1000 bytes coding->source was changed. */
1002 static ptrdiff_t
1003 coding_change_source (struct coding_system *coding)
1005 const unsigned char *orig = coding->source;
1006 coding_set_source (coding);
1007 return coding->source - orig;
1011 /* Set coding->destination from coding->dst_object. */
1013 static void
1014 coding_set_destination (struct coding_system *coding)
1016 if (BUFFERP (coding->dst_object))
1018 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1020 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1021 coding->dst_bytes = (GAP_END_ADDR
1022 - (coding->src_bytes - coding->consumed)
1023 - coding->destination);
1025 else
1027 /* We are sure that coding->dst_pos_byte is before the gap
1028 of the buffer. */
1029 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1030 + coding->dst_pos_byte - BEG_BYTE);
1031 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1032 - coding->destination);
1035 else
1037 /* Otherwise, the destination is C string and is never relocated
1038 automatically. Thus we don't have to update anything. */
1043 /* Set coding->destination from coding->dst_object, and return how
1044 many bytes coding->destination was changed. */
1046 static ptrdiff_t
1047 coding_change_destination (struct coding_system *coding)
1049 const unsigned char *orig = coding->destination;
1050 coding_set_destination (coding);
1051 return coding->destination - orig;
1055 static void
1056 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1058 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1059 string_overflow ();
1060 coding->destination = xrealloc (coding->destination,
1061 coding->dst_bytes + bytes);
1062 coding->dst_bytes += bytes;
1065 static void
1066 coding_alloc_by_making_gap (struct coding_system *coding,
1067 ptrdiff_t gap_head_used, ptrdiff_t bytes)
1069 if (EQ (coding->src_object, coding->dst_object))
1071 /* The gap may contain the produced data at the head and not-yet
1072 consumed data at the tail. To preserve those data, we at
1073 first make the gap size to zero, then increase the gap
1074 size. */
1075 ptrdiff_t add = GAP_SIZE;
1077 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1078 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1079 make_gap (bytes);
1080 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1081 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1083 else
1084 make_gap_1 (XBUFFER (coding->dst_object), bytes);
1088 static unsigned char *
1089 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1090 unsigned char *dst)
1092 ptrdiff_t offset = dst - coding->destination;
1094 if (BUFFERP (coding->dst_object))
1096 struct buffer *buf = XBUFFER (coding->dst_object);
1098 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1100 else
1101 coding_alloc_by_realloc (coding, nbytes);
1102 coding_set_destination (coding);
1103 dst = coding->destination + offset;
1104 return dst;
1107 /** Macros for annotations. */
1109 /* An annotation data is stored in the array coding->charbuf in this
1110 format:
1111 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1112 LENGTH is the number of elements in the annotation.
1113 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1114 NCHARS is the number of characters in the text annotated.
1116 The format of the following elements depends on ANNOTATION_MASK.
1118 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1119 follow:
1120 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1122 NBYTES is the number of bytes specified in the header part of
1123 old-style emacs-mule encoding, or 0 for the other kind of
1124 composition.
1126 METHOD is one of enum composition_method.
1128 Optional COMPOSITION-COMPONENTS are characters and composition
1129 rules.
1131 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1132 follows.
1134 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1135 recover from an invalid annotation, and should be skipped by
1136 produce_annotation. */
1138 /* Maximum length of the header of annotation data.  The longest
   header is the composition one, which ADD_COMPOSITION_DATA declares
   as five elements; ADD_CHARSET_DATA declares four.  */
1139 #define MAX_ANNOTATION_LENGTH 5
/* Append the common three-element annotation header to BUF and
   advance BUF past it:

     -LEN    the negated total number of elements in the annotation,
     MASK    the annotation kind (one of CODING_ANNOTATE_XXX_MASK),
     NCHARS  the number of characters in the annotated text.

   Also mark the coding system as carrying annotations.

   BUF must be a modifiable pointer lvalue and is evaluated three
   times.  A variable named `coding' must be in scope.  The expansion
   is a single statement with no trailing semicolon, so the macro can
   be used as the body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
  do { \
    *(buf)++ = -(len); \
    *(buf)++ = (mask); \
    *(buf)++ = (nchars); \
    coding->annotated = 1; \
  } while (0)
/* Append a five-element composition annotation header to BUF and
   advance BUF past it:

     [ -5 CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]

   NCHARS is the number of characters composed, NBYTES the byte
   length given in the header of an old-style emacs-mule composition
   (0 for other kinds), and METHOD one of enum composition_method.

   BUF must be a modifiable pointer lvalue and is evaluated several
   times.  Every argument is parenthesized in the expansion, so
   expressions such as `*pp' are safe to pass as BUF.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
  do { \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes); \
    *(buf)++ = (method); \
  } while (0)
/* Append a four-element charset annotation header to BUF and advance
   BUF past it:

     [ -4 CODING_ANNOTATE_CHARSET_MASK NCHARS ID ]

   NCHARS is the number of characters annotated and ID the charset
   id.

   BUF must be a modifiable pointer lvalue and is evaluated several
   times.  Every argument is parenthesized in the expansion, so
   expressions such as `*pp' are safe to pass as BUF.  */

#define ADD_CHARSET_DATA(buf, nchars, id) \
  do { \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id); \
  } while (0)
1164 /* Bitmasks for coding->eol_seen. */
/* No bit set: no end-of-line sequence recorded.  */
1166 #define EOL_SEEN_NONE 0
/* A LF ('\n') that was not consumed as part of a CR LF pair.  */
1167 #define EOL_SEEN_LF 1
/* A CR ('\r') that was not immediately followed by LF.  */
1168 #define EOL_SEEN_CR 2
/* A CR immediately followed by LF.  */
1169 #define EOL_SEEN_CRLF 4
1172 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1177 /*** 3. UTF-8 ***/
1179 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1180 Return true if a text is encoded in UTF-8. */
/* Classify the octet C by its high-order bits.  */

/* 0xxxxxxx: a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c) (0x80 > (c))
/* 10xxxxxx: a trailing (continuation) octet.  */
#define UTF_8_EXTRA_OCTET_P(c) (0x80 == ((c) & 0xC0))
/* 110xxxxx: leads a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (0xC0 == ((c) & 0xE0))
/* 1110xxxx: leads a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (0xE0 == ((c) & 0xF0))
/* 11110xxx: leads a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (0xF0 == ((c) & 0xF8))
/* 111110xx: leads a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (0xF8 == ((c) & 0xFC))
1189 #define UTF_8_BOM_1 0xEF
1190 #define UTF_8_BOM_2 0xBB
1191 #define UTF_8_BOM_3 0xBF
1193 /* Unlike the other detect_coding_XXX, this function counts the number
1194 of characters and checks the EOL format. */
/* Scan the source of CODING as UTF-8 and record the verdict in
   DETECT_INFO.

   Return 0, after adding CATEGORY_MASK_UTF_8 to DETECT_INFO->rejected,
   when an invalid sequence is met or when the last block ends in the
   middle of a sequence.  Otherwise return 1, update
   DETECT_INFO->found and ->rejected according to whether a BOM and
   any multi-octet character were seen, and store the scanned byte
   and character counts in CODING->detected_utf8_bytes and
   CODING->detected_utf8_chars.

   The label `no_more_source' is reached from inside ONE_MORE_BYTE,
   presumably when SRC hits SRC_END (the macro is defined elsewhere).

   NOTE(review): the local `eol_seen' is updated below but is never
   stored back into CODING in this function -- confirm whether that
   is intended.  */
1196 static bool
1197 detect_coding_utf_8 (struct coding_system *coding,
1198 struct coding_detection_info *detect_info)
1200 const unsigned char *src = coding->source, *src_base;
1201 const unsigned char *src_end = coding->source + coding->src_bytes;
1202 bool multibytep = coding->src_multibyte;
1203 ptrdiff_t consumed_chars = 0;
1204 bool bom_found = 0;
1205 ptrdiff_t nchars = coding->head_ascii;
1206 int eol_seen = coding->eol_seen;
1208 detect_info->checked |= CATEGORY_MASK_UTF_8;
1209 /* A coding system of this category is always ASCII compatible. */
/* So the first coding->head_ascii bytes are skipped; they were
   already counted as one character each when NCHARS was
   initialized.  */
1210 src += nchars;
/* NOTE(review): `src + 3 < src_end' demands at least one byte after
   the BOM, so a source consisting of the three BOM bytes alone is
   not recognized as having a BOM -- confirm whether `<=' was
   meant.  */
1212 if (src == coding->source /* BOM should be at the head. */
1213 && src + 3 < src_end /* BOM is 3-byte long. */
1214 && src[0] == UTF_8_BOM_1
1215 && src[1] == UTF_8_BOM_2
1216 && src[2] == UTF_8_BOM_3)
1218 bom_found = 1;
1219 src += 3;
/* The BOM itself is counted as one character.  */
1220 nchars++;
/* Each iteration validates one sequence of one to five octets and
   counts it as one character.  Any `break' means the sequence is
   invalid.  */
1223 while (1)
1225 int c, c1, c2, c3, c4;
1227 src_base = src;
1228 ONE_MORE_BYTE (c);
1229 if (c < 0 || UTF_8_1_OCTET_P (c))
1231 nchars++;
1232 if (c == '\r')
1234 if (src < src_end && *src == '\n')
/* CR LF: consume the LF as well; the pair counts as two
   characters.  */
1236 eol_seen |= EOL_SEEN_CRLF;
1237 src++;
1238 nchars++;
1240 else
1241 eol_seen |= EOL_SEEN_CR;
1243 else if (c == '\n')
1244 eol_seen |= EOL_SEEN_LF;
1245 continue;
1247 ONE_MORE_BYTE (c1);
1248 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1249 break;
1250 if (UTF_8_2_OCTET_LEADING_P (c))
1252 nchars++;
1253 continue;
1255 ONE_MORE_BYTE (c2);
1256 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1257 break;
1258 if (UTF_8_3_OCTET_LEADING_P (c))
1260 nchars++;
1261 continue;
1263 ONE_MORE_BYTE (c3);
1264 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1265 break;
1266 if (UTF_8_4_OCTET_LEADING_P (c))
1268 nchars++;
1269 continue;
1271 ONE_MORE_BYTE (c4);
1272 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1273 break;
1274 if (UTF_8_5_OCTET_LEADING_P (c))
1276 nchars++;
1277 continue;
1279 break;
/* Reached only through a `break' above: the source is not valid
   UTF-8.  */
1281 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1282 return 0;
1284 no_more_source:
/* SRC_BASE < SRC means some octets of an unfinished sequence were
   already read; on the last block that is an error.  */
1285 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1287 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1288 return 0;
1290 if (bom_found)
1292 /* The first character U+FEFF doesn't necessarily mean a BOM. */
1293 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1295 else
1297 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1298 if (nchars < src_end - coding->source)
1299 /* The found characters are less than source bytes, which
1300 means that we found a valid non-ASCII characters. */
1301 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
/* Only complete sequences are reported: the byte count stops at the
   start of the last (possibly unfinished) sequence.  */
1303 coding->detected_utf8_bytes = src_base - coding->source;
1304 coding->detected_utf8_chars = nchars;
1305 return 1;
/* Decode UTF-8 bytes from CODING's source, starting at
   coding->consumed, into character codes appended to
   coding->charbuf.

   The loop stops when the charbuf is full or when ONE_MORE_BYTE
   jumps to `no_more_source' (presumably on running out of source;
   the macro is defined elsewhere).  On exit, coding->consumed,
   coding->consumed_char and coding->charbuf_used describe the
   progress up to the start of the last unit that was not completely
   handled.

   An invalid or overlong sequence is not dropped: its first byte is
   emitted as a single element (as BYTE8_TO_CHAR for a non-ASCII
   byte), coding->errors is incremented, and decoding resumes at the
   following byte.  */
1309 static void
1310 decode_coding_utf_8 (struct coding_system *coding)
1312 const unsigned char *src = coding->source + coding->consumed;
1313 const unsigned char *src_end = coding->source + coding->src_bytes;
1314 const unsigned char *src_base;
1315 int *charbuf = coding->charbuf + coding->charbuf_used;
1316 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1317 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1318 bool multibytep = coding->src_multibyte;
1319 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1320 bool eol_dos
1321 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
/* With DOS EOL handling, the byte following a CR is read ahead into
   this variable; -1 means no byte is pending.  */
1322 int byte_after_cr = -1;
/* If a BOM may be present, inspect the first three bytes.  Anything
   other than EF BB BF is pushed back by resetting SRC to SRC_BASE;
   a real BOM is skipped.  */
1324 if (bom != utf_without_bom)
1326 int c1, c2, c3;
1328 src_base = src;
1329 ONE_MORE_BYTE (c1);
1330 if (! UTF_8_3_OCTET_LEADING_P (c1))
1331 src = src_base;
1332 else
1334 ONE_MORE_BYTE (c2);
1335 if (! UTF_8_EXTRA_OCTET_P (c2))
1336 src = src_base;
1337 else
1339 ONE_MORE_BYTE (c3);
1340 if (! UTF_8_EXTRA_OCTET_P (c3))
1341 src = src_base;
1342 else
1344 if ((c1 != UTF_8_BOM_1)
1345 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1346 src = src_base;
1347 else
1348 CODING_UTF_8_BOM (coding) = utf_without_bom;
/* From here on the coding system is marked as having no BOM, so the
   check above is not repeated on later calls.  */
1353 CODING_UTF_8_BOM (coding) = utf_without_bom;
1355 while (1)
1357 int c, c1, c2, c3, c4, c5;
/* Remember where this unit starts so that it can be re-read
   (invalid_code) or left unconsumed (no_more_source).  */
1359 src_base = src;
1360 consumed_chars_base = consumed_chars;
1362 if (charbuf >= charbuf_end)
/* The charbuf is full.  A pending read-ahead byte was not used, so
   step back over it to leave it unconsumed.  */
1364 if (byte_after_cr >= 0)
1365 src_base--;
1366 break;
1369 /* In the simple case, rapidly handle ordinary characters */
/* The inner loop is unrolled four times; each step copies one byte
   below 0x80 and stops at the first byte with the high bit set.  It
   runs only while more than six charbuf slots and more than six
   source bytes remain.  */
1370 if (multibytep && ! eol_dos
1371 && charbuf < charbuf_end - 6 && src < src_end - 6)
1373 while (charbuf < charbuf_end - 6 && src < src_end - 6)
1375 c1 = *src;
1376 if (c1 & 0x80)
1377 break;
1378 src++;
1379 consumed_chars++;
1380 *charbuf++ = c1;
1382 c1 = *src;
1383 if (c1 & 0x80)
1384 break;
1385 src++;
1386 consumed_chars++;
1387 *charbuf++ = c1;
1389 c1 = *src;
1390 if (c1 & 0x80)
1391 break;
1392 src++;
1393 consumed_chars++;
1394 *charbuf++ = c1;
1396 c1 = *src;
1397 if (c1 & 0x80)
1398 break;
1399 src++;
1400 consumed_chars++;
1401 *charbuf++ = c1;
1403 /* If we handled at least one character, restart the main loop. */
1404 if (src != src_base)
1405 continue;
/* Use the byte read ahead after a CR, if any, before reading a new
   one.  */
1408 if (byte_after_cr >= 0)
1409 c1 = byte_after_cr, byte_after_cr = -1;
1410 else
1411 ONE_MORE_BYTE (c1);
1412 if (c1 < 0)
/* ONE_MORE_BYTE delivered a negated value; store it as a
   character.  */
1414 c = - c1;
1416 else if (UTF_8_1_OCTET_P (c1))
/* With DOS EOL handling, read the byte after a CR now.  If the
   source ends here, ONE_MORE_BYTE leaves through no_more_source
   before the CR is stored, so the CR stays unconsumed.  */
1418 if (eol_dos && c1 == '\r')
1419 ONE_MORE_BYTE (byte_after_cr);
1420 c = c1;
1422 else
1424 ONE_MORE_BYTE (c2);
1425 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1426 goto invalid_code;
1427 if (UTF_8_2_OCTET_LEADING_P (c1))
1429 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1430 /* Reject overlong sequences here and below. Encoders
1431 producing them are incorrect, they can be misleading,
1432 and they mess up read/write invariance. */
1433 if (c < 128)
1434 goto invalid_code;
1436 else
1438 ONE_MORE_BYTE (c3);
1439 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1440 goto invalid_code;
1441 if (UTF_8_3_OCTET_LEADING_P (c1))
1443 c = (((c1 & 0xF) << 12)
1444 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1445 if (c < 0x800
1446 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1447 goto invalid_code;
1449 else
1451 ONE_MORE_BYTE (c4);
1452 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1453 goto invalid_code;
1454 if (UTF_8_4_OCTET_LEADING_P (c1))
1456 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1457 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1458 if (c < 0x10000)
1459 goto invalid_code;
1461 else
1463 ONE_MORE_BYTE (c5);
1464 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1465 goto invalid_code;
1466 if (UTF_8_5_OCTET_LEADING_P (c1))
1468 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1469 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1470 | (c5 & 0x3F));
/* Five-octet forms are accepted only for codes from 0x200000 up to
   MAX_CHAR.  */
1471 if ((c > MAX_CHAR) || (c < 0x200000))
1472 goto invalid_code;
1474 else
1475 goto invalid_code;
1481 *charbuf++ = c;
1482 continue;
1484 invalid_code:
/* Rewind to the start of the bad sequence, emit only its first byte
   as a raw byte, and let the main loop continue with the next
   byte.  */
1485 src = src_base;
1486 consumed_chars = consumed_chars_base;
1487 ONE_MORE_BYTE (c);
1488 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1489 coding->errors++;
1492 no_more_source:
/* Report progress only up to the start of the last unit that was
   not completely handled.  */
1493 coding->consumed_char += consumed_chars_base;
1494 coding->consumed = src_base - coding->source;
1495 coding->charbuf_used = charbuf - coding->charbuf;
/* Encode the character codes in coding->charbuf as UTF-8 and append
   the bytes to coding->destination after the coding->produced bytes
   already there.

   When the coding system is utf_with_bom, the three BOM bytes are
   emitted first and the coding system is switched to
   utf_without_bom, so the BOM is written only once.

   Records CODING_RESULT_SUCCESS, updates coding->produced and
   coding->produced_char, and always returns 0.  */
1499 static bool
1500 encode_coding_utf_8 (struct coding_system *coding)
1502 bool multibytep = coding->dst_multibyte;
1503 int *charbuf = coding->charbuf;
1504 int *charbuf_end = charbuf + coding->charbuf_used;
1505 unsigned char *dst = coding->destination + coding->produced;
1506 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1507 ptrdiff_t produced_chars = 0;
1508 int c;
1510 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1512 ASSURE_DESTINATION (3);
1513 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1514 CODING_UTF_8_BOM (coding) = utf_without_bom;
1517 if (multibytep)
/* Twice the longest multibyte form is reserved per character;
   presumably EMIT_ONE_BYTE can store up to two bytes per emitted
   byte in a multibyte destination -- TODO confirm against the
   macro.  */
1519 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1521 while (charbuf < charbuf_end)
1523 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1525 ASSURE_DESTINATION (safe_room);
1526 c = *charbuf++;
1527 if (CHAR_BYTE8_P (c))
/* A raw-byte character is written back as its original byte.  */
1529 c = CHAR_TO_BYTE8 (c);
1530 EMIT_ONE_BYTE (c);
1532 else
/* Build the multibyte form in STR, then emit it byte by byte.  */
1534 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1535 for (p = str; p < pend; p++)
1536 EMIT_ONE_BYTE (*p);
1540 else
1542 int safe_room = MAX_MULTIBYTE_LENGTH;
1544 while (charbuf < charbuf_end)
1546 ASSURE_DESTINATION (safe_room);
1547 c = *charbuf++;
1548 if (CHAR_BYTE8_P (c))
1549 *dst++ = CHAR_TO_BYTE8 (c);
1550 else
1551 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
/* In this branch the bytes are stored directly, so the count of
   produced characters is taken to be the count of bytes written by
   this call.  */
1553 produced_chars = dst - (coding->destination + coding->produced);
1555 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1556 coding->produced_char += produced_chars;
1557 coding->produced = dst - coding->destination;
1558 return 0;
1562 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1563 Return true if a text is encoded in one of UTF-16 based coding systems. */
/* True if the 16-bit code unit VAL lies in 0xD800..0xDBFF, i.e. is
   the first (high) half of a surrogate pair.  */
#define UTF_16_HIGH_SURROGATE_P(val) (0xD800 == ((val) & 0xFC00))

/* True if the 16-bit code unit VAL lies in 0xDC00..0xDFFF, i.e. is
   the second (low) half of a surrogate pair.  */
#define UTF_16_LOW_SURROGATE_P(val) (0xDC00 == ((val) & 0xFC00))
/* Examine the source of CODING for signs of UTF-16 and record the
   verdict in DETECT_INFO.

   - On the last block, an odd coding->src_chars rejects every UTF-16
     category and returns 0.
   - A leading FF FE marks the little-endian (and auto) categories as
     found; a leading FE FF does the same for big-endian.  In both
     cases the opposite byte order and the two NOSIG categories are
     rejected, and 1 is returned.
   - Without a BOM, the BOM-dependent categories are rejected and the
     remaining text is sampled to decide about the NOSIG categories;
     this path returns 0 unless the source runs out first.

   The label `no_more_source' is reached from inside TWO_MORE_BYTES,
   presumably when fewer than two bytes remain (the macro is defined
   elsewhere); the function then returns 1.  */
1572 static bool
1573 detect_coding_utf_16 (struct coding_system *coding,
1574 struct coding_detection_info *detect_info)
1576 const unsigned char *src = coding->source;
1577 const unsigned char *src_end = coding->source + coding->src_bytes;
1578 bool multibytep = coding->src_multibyte;
1579 int c1, c2;
1581 detect_info->checked |= CATEGORY_MASK_UTF_16;
1582 if (coding->mode & CODING_MODE_LAST_BLOCK
1583 && (coding->src_chars & 1))
1585 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1586 return 0;
1589 TWO_MORE_BYTES (c1, c2);
1590 if ((c1 == 0xFF) && (c2 == 0xFE))
1592 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1593 | CATEGORY_MASK_UTF_16_AUTO);
1594 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1595 | CATEGORY_MASK_UTF_16_BE_NOSIG
1596 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1598 else if ((c1 == 0xFE) && (c2 == 0xFF))
1600 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1601 | CATEGORY_MASK_UTF_16_AUTO);
1602 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1603 | CATEGORY_MASK_UTF_16_BE_NOSIG
1604 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1606 else if (c2 < 0)
1608 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1609 return 0;
1611 else
1613 /* We check the dispersion of Eth and Oth bytes where E is even and
1614 O is odd. If both are high, we assume binary data.*/
/* E[b] / O[b] are nonzero once byte value b has occurred at an even
   / odd offset; E_NUM and O_NUM count the distinct values seen so
   far, starting at 1 for the first pair read above.  */
1615 unsigned char e[256], o[256];
1616 unsigned e_num = 1, o_num = 1;
1618 memset (e, 0, 256);
1619 memset (o, 0, 256);
1620 e[c1] = 1;
1621 o[c2] = 1;
1623 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1624 |CATEGORY_MASK_UTF_16_BE
1625 | CATEGORY_MASK_UTF_16_LE);
/* Keep sampling until every UTF-16 category has been rejected.  */
1627 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1628 != CATEGORY_MASK_UTF_16)
1630 TWO_MORE_BYTES (c1, c2);
1631 if (c2 < 0)
1632 break;
1633 if (! e[c1])
1635 e[c1] = 1;
1636 e_num++;
/* 128 or more distinct values at even offsets: reject big-endian
   without signature.  */
1637 if (e_num >= 128)
1638 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1640 if (! o[c2])
1642 o[c2] = 1;
1643 o_num++;
/* 128 or more distinct values at odd offsets: reject little-endian
   without signature.  */
1644 if (o_num >= 128)
1645 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1648 return 0;
1651 no_more_source:
1652 return 1;
1655 static void
1656 decode_coding_utf_16 (struct coding_system *coding)
1658 const unsigned char *src = coding->source + coding->consumed;
1659 const unsigned char *src_end = coding->source + coding->src_bytes;
1660 const unsigned char *src_base;
1661 int *charbuf = coding->charbuf + coding->charbuf_used;
1662 /* We may produces at most 3 chars in one loop. */
1663 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1664 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1665 bool multibytep = coding->src_multibyte;
1666 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1667 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1668 int surrogate = CODING_UTF_16_SURROGATE (coding);
1669 bool eol_dos
1670 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1671 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1673 if (bom == utf_with_bom)
1675 int c, c1, c2;
1677 src_base = src;
1678 ONE_MORE_BYTE (c1);
1679 ONE_MORE_BYTE (c2);
1680 c = (c1 << 8) | c2;
1682 if (endian == utf_16_big_endian
1683 ? c != 0xFEFF : c != 0xFFFE)
1685 /* The first two bytes are not BOM. Treat them as bytes
1686 for a normal character. */
1687 src = src_base;
1688 coding->errors++;
1690 CODING_UTF_16_BOM (coding) = utf_without_bom;
1692 else if (bom == utf_detect_bom)
1694 /* We have already tried to detect BOM and failed in
1695 detect_coding. */
1696 CODING_UTF_16_BOM (coding) = utf_without_bom;
1699 while (1)
1701 int c, c1, c2;
1703 src_base = src;
1704 consumed_chars_base = consumed_chars;
1706 if (charbuf >= charbuf_end)
1708 if (byte_after_cr1 >= 0)
1709 src_base -= 2;
1710 break;
1713 if (byte_after_cr1 >= 0)
1714 c1 = byte_after_cr1, byte_after_cr1 = -1;
1715 else
1716 ONE_MORE_BYTE (c1);
1717 if (c1 < 0)
1719 *charbuf++ = -c1;
1720 continue;
1722 if (byte_after_cr2 >= 0)
1723 c2 = byte_after_cr2, byte_after_cr2 = -1;
1724 else
1725 ONE_MORE_BYTE (c2);
1726 if (c2 < 0)
1728 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1729 *charbuf++ = -c2;
1730 continue;
1732 c = (endian == utf_16_big_endian
1733 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1735 if (surrogate)
1737 if (! UTF_16_LOW_SURROGATE_P (c))
1739 if (endian == utf_16_big_endian)
1740 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1741 else
1742 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1743 *charbuf++ = c1;
1744 *charbuf++ = c2;
1745 coding->errors++;
1746 if (UTF_16_HIGH_SURROGATE_P (c))
1747 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1748 else
1749 *charbuf++ = c;
1751 else
1753 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1754 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1755 *charbuf++ = 0x10000 + c;
1758 else
1760 if (UTF_16_HIGH_SURROGATE_P (c))
1761 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1762 else
1764 if (eol_dos && c == '\r')
1766 ONE_MORE_BYTE (byte_after_cr1);
1767 ONE_MORE_BYTE (byte_after_cr2);
1769 *charbuf++ = c;
1774 no_more_source:
1775 coding->consumed_char += consumed_chars_base;
1776 coding->consumed = src_base - coding->source;
1777 coding->charbuf_used = charbuf - coding->charbuf;
/* Encode the character codes in coding->charbuf as UTF-16 in the
   byte order configured for CODING, and append the bytes to
   coding->destination after the coding->produced bytes already
   there.

   Unless the coding system is utf_without_bom, a BOM in the
   configured byte order is emitted first and the coding system is
   switched to utf_without_bom, so the BOM is written only once.

   A character above MAX_UNICODE_CHAR is replaced by
   coding->default_char.  Codes below 0x10000 become one 16-bit unit;
   larger codes become a surrogate pair.

   Records CODING_RESULT_SUCCESS, updates coding->produced and
   coding->produced_char, and always returns 0.  PRODUCED_CHARS is
   not assigned directly here; presumably the EMIT_* macros (defined
   elsewhere) maintain it.  */
1780 static bool
1781 encode_coding_utf_16 (struct coding_system *coding)
1783 bool multibytep = coding->dst_multibyte;
1784 int *charbuf = coding->charbuf;
1785 int *charbuf_end = charbuf + coding->charbuf_used;
1786 unsigned char *dst = coding->destination + coding->produced;
1787 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1788 int safe_room = 8;
1789 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1790 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1791 ptrdiff_t produced_chars = 0;
1792 int c;
1794 if (bom != utf_without_bom)
1796 ASSURE_DESTINATION (safe_room);
1797 if (big_endian)
1798 EMIT_TWO_BYTES (0xFE, 0xFF);
1799 else
1800 EMIT_TWO_BYTES (0xFF, 0xFE);
1801 CODING_UTF_16_BOM (coding) = utf_without_bom;
1804 while (charbuf < charbuf_end)
1806 ASSURE_DESTINATION (safe_room);
1807 c = *charbuf++;
1808 if (c > MAX_UNICODE_CHAR)
1809 c = coding->default_char;
1811 if (c < 0x10000)
1813 if (big_endian)
1814 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1815 else
1816 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1818 else
1820 int c1, c2;
/* Split the code into a surrogate pair: C1 is the high surrogate
   (0xD800 + upper ten bits), C2 the low one (0xDC00 + lower ten
   bits).  */
1822 c -= 0x10000;
1823 c1 = (c >> 10) + 0xD800;
1824 c2 = (c & 0x3FF) + 0xDC00;
1825 if (big_endian)
1826 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1827 else
1828 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1831 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1832 coding->produced = dst - coding->destination;
1833 coding->produced_char += produced_chars;
1834 return 0;
1838 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1840 /* Emacs' internal format for representation of multiple character
1841 sets is a kind of multi-byte encoding, i.e. characters are
1842 represented by variable-length sequences of one-byte codes.
1844 ASCII characters and control characters (e.g. `tab', `newline') are
1845 represented by one-byte sequences which are their ASCII codes, in
1846 the range 0x00 through 0x7F.
1848 8-bit characters of the range 0x80..0x9F are represented by
1849 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1850 code + 0x20).
1852 8-bit characters of the range 0xA0..0xFF are represented by
1853 one-byte sequences which are their 8-bit code.
1855 The other characters are represented by a sequence of `base
1856 leading-code', optional `extended leading-code', and one or two
1857 `position-code's. The length of the sequence is determined by the
1858 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1859 whereas extended leading-code and position-code take the range 0xA0
1860 through 0xFF. See `charset.h' for more details about leading-code
1861 and position-code.
1863 --- CODE RANGE of Emacs' internal format ---
1864 character set range
1865 ------------- -----
1866 ascii 0x00..0x7F
1867 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1868 eight-bit-graphic 0xA0..0xFF
1869 ELSE 0x81..0x9D + [0xA0..0xFF]+
1870 ---------------------------------------------
1872 As this is the internal character representation, the format is
1873 usually not used externally (i.e. in a file or in a data sent to a
1874 process). But, it is possible to have a text externally in this
1875 format (i.e. by encoding by the coding system `emacs-mule').
1877 In that case, a sequence of one-byte codes has a slightly different
1878 form.
1880 At first, all characters in eight-bit-control are represented by
1881 one-byte sequences which are their 8-bit code.
1883 Next, character composition data are represented by the byte
1884 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1885 where,
1886 METHOD is 0xF2 plus one of composition method (enum
1887 composition_method),
1889 BYTES is 0xA0 plus a byte length of this composition data,
1891 CHARS is 0xA0 plus a number of characters composed by this
1892 data,
1894 COMPONENTs are characters of multibyte form or composition
1895 rules encoded by two-byte of ASCII codes.
1897 In addition, for backward compatibility, the following formats are
1898 also recognized as composition data on decoding.
1900 0x80 MSEQ ...
1901 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1903 Here,
1904 MSEQ is a multibyte form, but in one of these special formats:
1905 ASCII: 0xA0 ASCII_CODE+0x80,
1906 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1907 RULE is a one byte code of the range 0xA0..0xF0 that
1908 represents a composition rule.
/* Table indexed by a byte value.  Its users in this file treat the
   entry as the total number of bytes (1 to 4) in the emacs-mule
   sequence that the byte starts.  Presumably it is filled in during
   initialization elsewhere -- TODO confirm.  */
1911 char emacs_mule_bytes[256];
1914 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1915 Return true if a text is encoded in 'emacs-mule'. */
/* Scan the source of CODING and record the verdict in DETECT_INFO.

   CATEGORY_MASK_EMACS_MULE is added to DETECT_INFO->rejected, and 0
   is returned, when any of the following is met:
     - the control codes ISO_CODE_ESC, ISO_CODE_SI or ISO_CODE_SO;
     - a multibyte sequence cut short by a byte below 0xA0;
     - a run introduced by 0x80 that is four bytes or shorter;
     - on the last block, a source that ends in the middle of a
       sequence.
   Otherwise 1 is returned, and the category is added to
   DETECT_INFO->found if at least one multibyte sequence or
   composition run was seen.

   The label `no_more_source' is reached from inside ONE_MORE_BYTE,
   presumably when SRC hits SRC_END (the macro is defined
   elsewhere).  */
1917 static bool
1918 detect_coding_emacs_mule (struct coding_system *coding,
1919 struct coding_detection_info *detect_info)
1921 const unsigned char *src = coding->source, *src_base;
1922 const unsigned char *src_end = coding->source + coding->src_bytes;
1923 bool multibytep = coding->src_multibyte;
1924 ptrdiff_t consumed_chars = 0;
1925 int c;
1926 int found = 0;
1928 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1929 /* A coding system of this category is always ASCII compatible. */
1930 src += coding->head_ascii;
1932 while (1)
1934 src_base = src;
1935 ONE_MORE_BYTE (c);
1936 if (c < 0)
1937 continue;
1938 if (c == 0x80)
1940 /* Perhaps the start of composite character. We simply skip
1941 it because analyzing it is too heavy for detecting. But,
1942 at least, we check that the composite character
1943 consists of more than 4 bytes. */
1944 const unsigned char *src_start;
1946 repeat:
1947 src_start = src;
/* Skip every following byte that is 0xA0 or above; C ends up
   holding the first byte below 0xA0.  */
1950 ONE_MORE_BYTE (c);
1952 while (c >= 0xA0);
1954 if (src - src_start <= 4)
1955 break;
1956 found = CATEGORY_MASK_EMACS_MULE;
/* Another composition starts right away.  */
1957 if (c == 0x80)
1958 goto repeat;
1961 if (c < 0x80)
/* These control codes are taken as evidence against emacs-mule.  */
1963 if (c < 0x20
1964 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1965 break;
1967 else
/* C is a leading byte; every remaining byte of its sequence must be
   0xA0 or above.  */
1969 int more_bytes = emacs_mule_bytes[c] - 1;
1971 while (more_bytes > 0)
1973 ONE_MORE_BYTE (c);
1974 if (c < 0xA0)
1976 src--; /* Unread the last byte. */
1977 break;
1979 more_bytes--;
1981 if (more_bytes != 0)
1982 break;
1983 found = CATEGORY_MASK_EMACS_MULE;
/* Reached only through a `break' above.  */
1986 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1987 return 0;
1989 no_more_source:
1990 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1992 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1993 return 0;
1995 detect_info->found |= found;
1996 return 1;
2000 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2001 character. If CMP_STATUS indicates that we must expect MSEQ or
2002 RULE described above, decode it and return the negative value of
2003 the decoded character or rule. If an invalid byte is found, return
2004 -1. If SRC is too short, return -2. */
/* On success, *NBYTES receives the number of source bytes used and
   *NCHARS the value of the local consumed_chars counter (presumably
   maintained by ONE_MORE_BYTE, which is defined elsewhere).  When a
   character was decoded and ID is non-null, *ID receives the id of
   the charset the character was decoded with.  None of the outputs
   is written on the -1 and -2 returns.  Nothing is consumed from
   CODING; SRC is a local copy of the caller's pointer.  */
2006 static int
2007 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2008 int *nbytes, int *nchars, int *id,
2009 struct composition_status *cmp_status)
2011 const unsigned char *src_end = coding->source + coding->src_bytes;
2012 const unsigned char *src_base = src;
2013 bool multibytep = coding->src_multibyte;
2014 int charset_ID;
2015 unsigned code;
2016 int c;
2017 ptrdiff_t consumed_chars = 0;
2018 bool mseq_found = 0;
2020 ONE_MORE_BYTE (c);
2021 if (c < 0)
/* ONE_MORE_BYTE delivered a negated value; un-negate it and use the
   first entry of emacs_mule_charset.  */
2023 c = -c;
2024 charset_ID = emacs_mule_charset[0];
2026 else
2028 if (c >= 0xA0)
/* A first byte of 0xA0 or above is acceptable only inside an
   old-style composition.  */
2030 if (cmp_status->state != COMPOSING_NO
2031 && cmp_status->old_form)
2033 if (cmp_status->state == COMPOSING_CHAR)
/* MSEQ: either 0xA0 followed by (ASCII code + 0x80), or a leading
   code shifted up by 0x20.  */
2035 if (c == 0xA0)
2037 ONE_MORE_BYTE (c);
2038 c -= 0x80;
2039 if (c < 0)
2040 goto invalid_code;
2042 else
2043 c -= 0x20;
2044 mseq_found = 1;
2046 else
/* Not expecting a character: hand the byte back negated, to be
   interpreted by the caller as a rule.  */
2048 *nbytes = src - src_base;
2049 *nchars = consumed_chars;
2050 return -c;
2053 else
2054 goto invalid_code;
/* Dispatch on the total length of the sequence that C starts.  */
2057 switch (emacs_mule_bytes[c])
2059 case 2:
2060 if ((charset_ID = emacs_mule_charset[c]) < 0)
2061 goto invalid_code;
2062 ONE_MORE_BYTE (c);
2063 if (c < 0xA0)
2064 goto invalid_code;
2065 code = c & 0x7F;
2066 break;
2068 case 3:
2069 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2070 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
/* Private leading code: the charset is selected by the second byte
   and the code is one byte long.  */
2072 ONE_MORE_BYTE (c);
2073 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2074 goto invalid_code;
2075 ONE_MORE_BYTE (c);
2076 if (c < 0xA0)
2077 goto invalid_code;
2078 code = c & 0x7F;
2080 else
/* The charset is selected by the leading byte and the code is two
   bytes long.  */
2082 if ((charset_ID = emacs_mule_charset[c]) < 0)
2083 goto invalid_code;
2084 ONE_MORE_BYTE (c);
2085 if (c < 0xA0)
2086 goto invalid_code;
2087 code = (c & 0x7F) << 8;
2088 ONE_MORE_BYTE (c);
2089 if (c < 0xA0)
2090 goto invalid_code;
2091 code |= c & 0x7F;
2093 break;
2095 case 4:
/* The charset is selected by the second byte and the code is two
   bytes long.  */
2096 ONE_MORE_BYTE (c);
2097 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2098 goto invalid_code;
2099 ONE_MORE_BYTE (c);
2100 if (c < 0xA0)
2101 goto invalid_code;
2102 code = (c & 0x7F) << 8;
2103 ONE_MORE_BYTE (c);
2104 if (c < 0xA0)
2105 goto invalid_code;
2106 code |= c & 0x7F;
2107 break;
2109 case 1:
2110 code = c;
2111 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2112 break;
2114 default:
2115 emacs_abort ();
2117 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2118 CHARSET_FROM_ID (charset_ID), code, c);
2119 if (c < 0)
2120 goto invalid_code;
2122 *nbytes = src - src_base;
2123 *nchars = consumed_chars;
2124 if (id)
2125 *id = charset_ID;
/* A character read in MSEQ form is returned negated.  */
2126 return (mseq_found ? -c : c);
2128 no_more_source:
2129 return -2;
2131 invalid_code:
2132 return -1;
2136 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2138 /* Handle these composition sequence ('|': the end of header elements,
2139 BYTES and CHARS >= 0xA0):
2141 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2142 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2143 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2145 and these old form:
2147 (4) relative composition: 0x80 | MSEQ ... MSEQ
2148 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2150 When the starter 0x80 and the following header elements are found,
2151 this annotation header is produced.
2153 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2155 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2156 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2158 Then, upon reading the following elements, these codes are produced
2159 until the composition end is found:
2161 (1) CHAR ... CHAR
2162 (2) ALT ... ALT CHAR ... CHAR
2163 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2164 (4) CHAR ... CHAR
2165 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2167 When the composition end is found, LENGTH and NCHARS in the
2168 annotation header is updated as below:
2170 (1) LENGTH: unchanged, NCHARS: unchanged
2171 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2172 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2173 (4) LENGTH: unchanged, NCHARS: number of CHARs
2174 (5) LENGTH: unchanged, NCHARS: number of CHARs
2176 If an error is found while composing, the annotation header is
2177 changed to the original composition header (plus filler -1s) as
2178 below:
2180 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2181 (5) [ 0x80 0xFF -1 -1 -1 ]
2183 and the sequence [ -2 DECODED-RULE ] is changed to the original
2184 byte sequence as below:
2185 o the original byte sequence is B: [ B -1 ]
2186 o the original byte sequence is B1 B2: [ B1 B2 ]
2188 Most of the routines are implemented by macros because many
2189 variables and labels in the caller decode_coding_emacs_mule must be
2190 accessible, and they are usually called just once (thus doesn't
2191 increase the size of compiled object). */
/* Decode the Emacs 20 style composition rule held in C and store the
   result in RULE.

   C is first reduced by 0xA0 in place and must then lie in 0..80,
   otherwise control jumps to the label `invalid_code'.  The reduced
   value is split into the quotient and remainder by 9; in either
   part the value 4 is replaced by 10 before both are combined with
   COMPOSITION_ENCODE_RULE.

   C and RULE must be modifiable lvalues.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
  do { \
    int gref, nref; \
    \
    c -= 0xA0; \
    if (! (0 <= c && c < 81)) \
      goto invalid_code; \
    gref = (c / 9 == 4) ? 10 : c / 9; \
    nref = (c % 9 == 4) ? 10 : c % 9; \
    rule = COMPOSITION_ENCODE_RULE (gref, nref); \
  } while (0)
/* Decode the Emacs 21 style composition rule made of C and the next
   source byte, and store the result in RULE.

   Each of the two bytes, reduced by 0x20, must lie in 0..80,
   otherwise control jumps to the label `invalid_code'.  The second
   byte is fetched into C with ONE_MORE_BYTE.  The two reduced values
   are combined with COMPOSITION_ENCODE_RULE.

   C and RULE must be modifiable lvalues.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
  do { \
    int gref = c - 0x20; \
    int nref; \
    \
    if (! (0 <= gref && gref < 81)) \
      goto invalid_code; \
    ONE_MORE_BYTE (c); \
    nref = c - 0x20; \
    if (! (0 <= nref && nref < 81)) \
      goto invalid_code; \
    rule = COMPOSITION_ENCODE_RULE (gref, nref); \
  } while (0)
/* Start of an Emacs 21 style composition.  On entry C is the byte
   0xF2 + METHOD; the next two source bytes are BYTES + 0xA0 and
   CHARS + 0xA0, where BYTES is the byte length of this composition
   information and CHARS is the number of characters composed.

   BYTES must be at least 3 (and exactly 4 for a relative
   composition), and CHARS must be positive and smaller than
   MAX_COMPOSITION_COMPONENTS; otherwise control jumps to the caller's
   label `invalid_code'.  On success CMP_STATUS is initialized (the
   number of expected components is BYTES - 4) and a composition
   annotation header is appended to CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method cmp_method = c - 0xF2;                      \
    int info_bytes, composed_chars;                                     \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    info_bytes = c - 0xA0;                                              \
    if (info_bytes < 3)                                                 \
      goto invalid_code;                                                \
    if (cmp_method == COMPOSITION_RELATIVE && info_bytes != 4)          \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    composed_chars = c - 0xA0;                                          \
    if (composed_chars <= 0                                             \
        || composed_chars >= MAX_COMPOSITION_COMPONENTS)                \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = cmp_method;                                    \
    /* A relative composition has no component part, so its            \
       characters follow immediately.  */                               \
    cmp_status->state = (cmp_method == COMPOSITION_RELATIVE             \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = composed_chars;                                \
    cmp_status->ncomps = info_bytes - 4;                                \
    ADD_COMPOSITION_DATA (charbuf, composed_chars, info_bytes,          \
                          cmp_method);                                  \
  } while (0)
/* Start of an Emacs 20 style relative composition: reset CMP_STATUS
   for the old form with no characters or components seen yet, and
   append to CHARBUF a composition annotation header whose NCHARS and
   NBYTES are both 0.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()                     \
  do {                                                                  \
    cmp_status->old_form = 1;                                           \
    cmp_status->method = COMPOSITION_RELATIVE;                          \
    cmp_status->state = COMPOSING_CHAR;                                 \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->ncomps = 0;                                             \
    cmp_status->nchars = 0;                                             \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, COMPOSITION_RELATIVE);         \
  } while (0)
/* Start of an Emacs 20 style rule-base composition: reset CMP_STATUS
   for the old form with no characters or components seen yet, and
   append to CHARBUF a composition annotation header whose NCHARS and
   NBYTES are both 0.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()                     \
  do {                                                                  \
    cmp_status->old_form = 1;                                           \
    cmp_status->method = COMPOSITION_WITH_RULE;                         \
    cmp_status->state = COMPOSING_CHAR;                                 \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->ncomps = 0;                                             \
    cmp_status->nchars = 0;                                             \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, COMPOSITION_WITH_RULE);        \
  } while (0)
/* Read the byte that follows the composition introducer into C and
   start the composition it announces:

     0xF2 + METHOD  Emacs 21 style, for METHOD from
                    COMPOSITION_RELATIVE to
                    COMPOSITION_WITH_RULE_ALTCHARS;
     0xA0 .. 0xBF   Emacs 20 style relative composition; SRC is moved
                    back so that the byte is read again as a
                    composition component;
     0xFF           Emacs 20 style rule-base composition.

   A negative C or any other byte jumps to the caller's label
   `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                   \
  do {                                                          \
    const unsigned char *reread_from = src;                     \
                                                                \
    ONE_MORE_BYTE (c);                                          \
    if (c < 0)                                                  \
      goto invalid_code;                                        \
    if (COMPOSITION_RELATIVE <= c - 0xF2                        \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)          \
      DECODE_EMACS_MULE_21_COMPOSITION ();                      \
    else if (c >= 0xA0 && c < 0xC0)                             \
      {                                                         \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();           \
        /* Re-read C as a composition component.  */            \
        src = reread_from;                                      \
      }                                                         \
    else if (c == 0xFF)                                         \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();             \
    else                                                        \
      goto invalid_code;                                        \
  } while (0)
/* Finish the composition in progress successfully.  The annotation
   header starts CMP_STATUS->length elements before CHARBUF.  For the
   old (Emacs 20) form its third element (NCHARS) is set to the number
   of characters counted so far.  Otherwise, for every method other
   than the relative one, its first element is replaced by NCHARS
   minus the accumulated length.  The state becomes COMPOSING_NO.  */

#define EMACS_MULE_COMPOSITION_END()                                    \
  do {                                                                  \
    int *annotation = charbuf - cmp_status->length;                     \
                                                                        \
    if (cmp_status->old_form)                                           \
      annotation[2] = cmp_status->nchars;                               \
    else if (cmp_status->method > COMPOSITION_RELATIVE)                 \
      annotation[0] = annotation[2] - cmp_status->length;               \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
2325 static int
2326 emacs_mule_finish_composition (int *charbuf,
2327 struct composition_status *cmp_status)
2329 int idx = - cmp_status->length;
2330 int new_chars;
2332 if (cmp_status->old_form && cmp_status->nchars > 0)
2334 charbuf[idx + 2] = cmp_status->nchars;
2335 new_chars = 0;
2336 if (cmp_status->method == COMPOSITION_WITH_RULE
2337 && cmp_status->state == COMPOSING_CHAR)
2339 /* The last rule was invalid. */
2340 int rule = charbuf[-1] + 0xA0;
2342 charbuf[-2] = BYTE8_TO_CHAR (rule);
2343 charbuf[-1] = -1;
2344 new_chars = 1;
2347 else
2349 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2351 if (cmp_status->method == COMPOSITION_WITH_RULE)
2353 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2354 charbuf[idx++] = -3;
2355 charbuf[idx++] = 0;
2356 new_chars = 1;
2358 else
2360 int nchars = charbuf[idx + 1] + 0xA0;
2361 int nbytes = charbuf[idx + 2] + 0xA0;
2363 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2364 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2365 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2366 charbuf[idx++] = -1;
2367 new_chars = 4;
2370 cmp_status->state = COMPOSING_NO;
2371 return new_chars;
/* If a composition is in progress, terminate it with
   emacs_mule_finish_composition and advance CHAR_OFFSET by the number
   of characters that call reports.  Uses the caller's CHARBUF,
   CMP_STATUS and CHAR_OFFSET.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    if (cmp_status->state != COMPOSING_NO)                              \
      {                                                                 \
        int finished_chars                                              \
          = emacs_mule_finish_composition (charbuf, cmp_status);        \
                                                                        \
        char_offset += finished_chars;                                  \
      }                                                                 \
  } while (0)
/* Decode the emacs-mule encoded bytes of CODING->source, starting at
   offset CODING->consumed, into the integer buffer CODING->charbuf,
   starting at index CODING->charbuf_used.

   Besides character codes, the output receives charset annotations
   (whenever the charset of the decoded characters changes from a
   non-ASCII one) and composition annotations.  The composition state
   lives in CODING->spec.emacs_mule.cmp_status so that a composition
   may span several calls: unless CODING_MODE_LAST_BLOCK is set, the
   elements of an unfinished composition are saved in its `carryover'
   array on exit and copied back on the next entry.

   A byte sequence that cannot be decoded is handled at
   `invalid_code': its first byte is emitted as is and
   CODING->errors is incremented.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used reflect the last completely processed
   sequence.  */
2381 static void
2382 decode_coding_emacs_mule (struct coding_system *coding)
2384 const unsigned char *src = coding->source + coding->consumed;
2385 const unsigned char *src_end = coding->source + coding->src_bytes;
2386 const unsigned char *src_base;
2387 int *charbuf = coding->charbuf + coding->charbuf_used;
2388 /* We may produce two annotations (charset and composition) in one
2389 loop and one more charset annotation at the end. */
2390 int *charbuf_end
2391 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2392 /* We can produce up to 2 characters in a loop. */
2393 - 1;
2394 ptrdiff_t consumed_chars = 0, consumed_chars_base;
2395 bool multibytep = coding->src_multibyte;
2396 ptrdiff_t char_offset = coding->produced_char;
2397 ptrdiff_t last_offset = char_offset;
2398 int last_id = charset_ascii;
2399 bool eol_dos
2400 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
/* Byte read ahead after a CR when DOS end-of-line handling is on, or
   -1 if there is none.  */
2401 int byte_after_cr = -1;
2402 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
/* Resume a composition left unfinished by the previous call: copy the
   saved elements back to the head of CHARBUF.  Abort if they do not
   fit.  */
2404 if (cmp_status->state != COMPOSING_NO)
2406 int i;
2408 if (charbuf_end - charbuf < cmp_status->length)
2409 emacs_abort ();
2410 for (i = 0; i < cmp_status->length; i++)
2411 *charbuf++ = cmp_status->carryover[i];
2412 coding->annotated = 1;
2415 while (1)
2417 int c, id IF_LINT (= 0);
/* Remember where this sequence starts so that error recovery and the
   final bookkeeping can refer to it.  */
2419 src_base = src;
2420 consumed_chars_base = consumed_chars;
/* Stop when the output buffer is full.  A byte that was read ahead
   after a CR is given back by moving SRC_BASE one byte backward.  */
2422 if (charbuf >= charbuf_end)
2424 if (byte_after_cr >= 0)
2425 src_base--;
2426 break;
2429 if (byte_after_cr >= 0)
2430 c = byte_after_cr, byte_after_cr = -1;
2431 else
2432 ONE_MORE_BYTE (c);
/* A negative C or the composition introducer 0x80 first terminates
   any composition in progress.  A negative C is then stored negated
   as one character; 0x80 starts a new composition.  */
2434 if (c < 0 || c == 0x80)
2436 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2437 if (c < 0)
2439 *charbuf++ = -c;
2440 char_offset++;
2442 else
2443 DECODE_EMACS_MULE_COMPOSITION_START ();
2444 continue;
2447 if (c < 0x80)
2449 if (eol_dos && c == '\r')
2450 ONE_MORE_BYTE (byte_after_cr);
2451 id = charset_ascii;
2452 if (cmp_status->state != COMPOSING_NO)
2454 if (cmp_status->old_form)
2455 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2456 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2457 cmp_status->ncomps--;
2460 else
2462 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2463 /* emacs_mule_char can load a charset map from a file, which
2464 allocates a large structure and might cause buffer text
2465 to be relocated as result. Thus, we need to remember the
2466 original pointer to buffer text, and fix up all related
2467 pointers after the call. */
2468 const unsigned char *orig = coding->source;
2469 ptrdiff_t offset;
2471 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2472 cmp_status);
2473 offset = coding->source - orig;
2474 if (offset)
2476 src += offset;
2477 src_base += offset;
2478 src_end += offset;
/* Of the negative results, -1 is treated as an invalid sequence and
   -2 ends the loop; other negative values fall through and are
   handled below.  */
2480 if (c < 0)
2482 if (c == -1)
2483 goto invalid_code;
2484 if (c == -2)
2485 break;
2487 src = src_base + nbytes;
2488 consumed_chars = consumed_chars_base + nchars;
2489 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2490 cmp_status->ncomps -= nchars;
2493 /* Now if C >= 0, we found a normally encoded character, if C <
2494 0, we found an old-style composition component character or
2495 rule. */
2497 if (cmp_status->state == COMPOSING_NO)
2499 if (last_id != id)
2501 if (last_id != charset_ascii)
2502 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2503 last_id);
2504 last_id = id;
2505 last_offset = char_offset;
2507 *charbuf++ = c;
2508 char_offset++;
2510 else if (cmp_status->state == COMPOSING_CHAR)
2512 if (cmp_status->old_form)
2514 if (c >= 0)
2516 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2517 *charbuf++ = c;
2518 char_offset++;
2520 else
2522 *charbuf++ = -c;
2523 cmp_status->nchars++;
2524 cmp_status->length++;
2525 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2526 EMACS_MULE_COMPOSITION_END ();
2527 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2528 cmp_status->state = COMPOSING_RULE;
2531 else
2533 *charbuf++ = c;
2534 cmp_status->length++;
2535 cmp_status->nchars--;
2536 if (cmp_status->nchars == 0)
2537 EMACS_MULE_COMPOSITION_END ();
2540 else if (cmp_status->state == COMPOSING_RULE)
2542 int rule;
2544 if (c >= 0)
2546 EMACS_MULE_COMPOSITION_END ();
2547 *charbuf++ = c;
2548 char_offset++;
2550 else
2552 c = -c;
2553 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2554 if (rule < 0)
2555 goto invalid_code;
2556 *charbuf++ = -2;
2557 *charbuf++ = rule;
2558 cmp_status->length += 2;
2559 cmp_status->state = COMPOSING_CHAR;
2562 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2564 *charbuf++ = c;
2565 cmp_status->length++;
2566 if (cmp_status->ncomps == 0)
2567 cmp_status->state = COMPOSING_CHAR;
2568 else if (cmp_status->ncomps > 0)
2570 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2571 cmp_status->state = COMPOSING_COMPONENT_RULE;
2573 else
2574 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2576 else /* COMPOSING_COMPONENT_RULE */
2578 int rule;
2580 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2581 if (rule < 0)
2582 goto invalid_code;
2583 *charbuf++ = -2;
2584 *charbuf++ = rule;
2585 cmp_status->length += 2;
2586 cmp_status->ncomps--;
2587 if (cmp_status->ncomps > 0)
2588 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2589 else
2590 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2592 continue;
/* Error recovery: terminate any composition, go back to the start of
   the offending sequence and emit its first byte unchanged (as a raw
   8-bit character unless it is ASCII), counting one error.  */
2594 invalid_code:
2595 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2596 src = src_base;
2597 consumed_chars = consumed_chars_base;
2598 ONE_MORE_BYTE (c);
2599 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2600 char_offset++;
2601 coding->errors++;
/* Reached when the loop above ends; presumably also the jump target
   of ONE_MORE_BYTE when the source is exhausted -- its definition is
   outside this part of the file.  An unfinished composition is
   terminated if this is the last block; otherwise its elements are
   removed from CHARBUF and saved for the next call.  */
2604 no_more_source:
2605 if (cmp_status->state != COMPOSING_NO)
2607 if (coding->mode & CODING_MODE_LAST_BLOCK)
2608 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2609 else
2611 int i;
2613 charbuf -= cmp_status->length;
2614 for (i = 0; i < cmp_status->length; i++)
2615 cmp_status->carryover[i] = charbuf[i];
2618 if (last_id != charset_ascii)
2619 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2620 coding->consumed_char += consumed_chars_base;
2621 coding->consumed = src_base - coding->source;
2622 coding->charbuf_used = charbuf - coding->charbuf;
/* Store in CODES (an array of at least two bytes) the leading code(s)
   that introduce a character of the charset whose emacs-mule id is
   ID:

     ID < 0xA0          ID itself; CODES[1] is set to 0 (no second code)
     0xA0 <= ID < 0xE0  0x9A ID
     0xE0 <= ID < 0xF0  0x9B ID
     0xF0 <= ID < 0xF5  0x9C ID
     otherwise          0x9D ID

   ID and CODES are evaluated more than once, so they must not have
   side effects.  The expansion is a single statement without a
   trailing semicolon, so the macro can be used as the body of an
   unbraced `if' that is followed by `else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2641 static bool
2642 encode_coding_emacs_mule (struct coding_system *coding)
2644 bool multibytep = coding->dst_multibyte;
2645 int *charbuf = coding->charbuf;
2646 int *charbuf_end = charbuf + coding->charbuf_used;
2647 unsigned char *dst = coding->destination + coding->produced;
2648 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2649 int safe_room = 8;
2650 ptrdiff_t produced_chars = 0;
2651 Lisp_Object attrs, charset_list;
2652 int c;
2653 int preferred_charset_id = -1;
2655 CODING_GET_INFO (coding, attrs, charset_list);
2656 if (! EQ (charset_list, Vemacs_mule_charset_list))
2658 charset_list = Vemacs_mule_charset_list;
2659 ASET (attrs, coding_attr_charset_list, charset_list);
2662 while (charbuf < charbuf_end)
2664 ASSURE_DESTINATION (safe_room);
2665 c = *charbuf++;
2667 if (c < 0)
2669 /* Handle an annotation. */
2670 switch (*charbuf)
2672 case CODING_ANNOTATE_COMPOSITION_MASK:
2673 /* Not yet implemented. */
2674 break;
2675 case CODING_ANNOTATE_CHARSET_MASK:
2676 preferred_charset_id = charbuf[3];
2677 if (preferred_charset_id >= 0
2678 && NILP (Fmemq (make_number (preferred_charset_id),
2679 charset_list)))
2680 preferred_charset_id = -1;
2681 break;
2682 default:
2683 emacs_abort ();
2685 charbuf += -c - 1;
2686 continue;
2689 if (ASCII_CHAR_P (c))
2690 EMIT_ONE_ASCII_BYTE (c);
2691 else if (CHAR_BYTE8_P (c))
2693 c = CHAR_TO_BYTE8 (c);
2694 EMIT_ONE_BYTE (c);
2696 else
2698 struct charset *charset;
2699 unsigned code;
2700 int dimension;
2701 int emacs_mule_id;
2702 unsigned char leading_codes[2];
2704 if (preferred_charset_id >= 0)
2706 bool result;
2708 charset = CHARSET_FROM_ID (preferred_charset_id);
2709 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2710 if (result)
2711 code = ENCODE_CHAR (charset, c);
2712 else
2713 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2714 &code, charset);
2716 else
2717 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2718 &code, charset);
2719 if (! charset)
2721 c = coding->default_char;
2722 if (ASCII_CHAR_P (c))
2724 EMIT_ONE_ASCII_BYTE (c);
2725 continue;
2727 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2728 &code, charset);
2730 dimension = CHARSET_DIMENSION (charset);
2731 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2732 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2733 EMIT_ONE_BYTE (leading_codes[0]);
2734 if (leading_codes[1])
2735 EMIT_ONE_BYTE (leading_codes[1]);
2736 if (dimension == 1)
2737 EMIT_ONE_BYTE (code | 0x80);
2738 else
2740 code |= 0x8080;
2741 EMIT_ONE_BYTE (code >> 8);
2742 EMIT_ONE_BYTE (code & 0xFF);
2746 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2747 coding->produced_char += produced_chars;
2748 coding->produced = dst - coding->destination;
2749 return 0;
2753 /*** 7. ISO2022 handlers ***/
2755 /* The following note describes the coding system ISO2022 briefly.
2756 Since the intention of this note is to help understand the
2757 functions in this file, some parts are NOT ACCURATE or are OVERLY
2758 SIMPLIFIED. For thorough understanding, please refer to the
2759 original document of ISO2022. This is equivalent to the standard
2760 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2762 ISO2022 provides many mechanisms to encode several character sets
2763 in 7-bit and 8-bit environments. For 7-bit environments, all text
2764 is encoded using bytes less than 128. This may make the encoded
2765 text a little bit longer, but the text passes more easily through
2766 several types of gateway, some of which strip off the MSB (Most
2767 Significant Bit).
2769 There are two kinds of character sets: control character sets and
2770 graphic character sets. The former contain control characters such
2771 as `newline' and `escape' to provide control functions (control
2772 functions are also provided by escape sequences). The latter
2773 contain graphic characters such as 'A' and '-'. Emacs recognizes
2774 two control character sets and many graphic character sets.
2776 Graphic character sets are classified into one of the following
2777 four classes, according to the number of bytes (DIMENSION) and
2778 number of characters in one dimension (CHARS) of the set:
2779 - DIMENSION1_CHARS94
2780 - DIMENSION1_CHARS96
2781 - DIMENSION2_CHARS94
2782 - DIMENSION2_CHARS96
2784 In addition, each character set is assigned an identification tag,
2785 unique for each set, called the "final character" (denoted as <F>
2786 hereafter). The <F> of each character set is decided by ECMA(*)
2787 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2788 (0x30..0x3F are for private use only).
2790 Note (*): ECMA = European Computer Manufacturers Association
2792 Here are examples of graphic character sets [NAME(<F>)]:
2793 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2794 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2795 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2796 o DIMENSION2_CHARS96 -- none for the moment
2798 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2799 C0 [0x00..0x1F] -- control character plane 0
2800 GL [0x20..0x7F] -- graphic character plane 0
2801 C1 [0x80..0x9F] -- control character plane 1
2802 GR [0xA0..0xFF] -- graphic character plane 1
2804 A control character set is directly designated and invoked to C0 or
2805 C1 by an escape sequence. The most common case is that:
2806 - ISO646's control character set is designated/invoked to C0, and
2807 - ISO6429's control character set is designated/invoked to C1,
2808 and usually these designations/invocations are omitted in encoded
2809 text. In a 7-bit environment, only C0 can be used, and a control
2810 character for C1 is encoded by an appropriate escape sequence to
2811 fit into the environment. All control characters for C1 are
2812 defined to have corresponding escape sequences.
2814 A graphic character set is at first designated to one of four
2815 graphic registers (G0 through G3), then these graphic registers are
2816 invoked to GL or GR. These designations and invocations can be
2817 done independently. The most common case is that G0 is invoked to
2818 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2819 these invocations and designations are omitted in encoded text.
2820 In a 7-bit environment, only GL can be used.
2822 When a graphic character set of CHARS94 is invoked to GL, codes
2823 0x20 and 0x7F of the GL area work as control characters SPACE and
2824 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2825 be used.
2827 There are two ways of invocation: locking-shift and single-shift.
2828 With locking-shift, the invocation lasts until the next different
2829 invocation, whereas with single-shift, the invocation affects the
2830 following character only and doesn't affect the locking-shift
2831 state. Invocations are done by the following control characters or
2832 escape sequences:
2834 ----------------------------------------------------------------------
2835 abbrev function cntrl escape seq description
2836 ----------------------------------------------------------------------
2837 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2838 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2839 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2840 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2841 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2842 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2843 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2844 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2845 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2846 ----------------------------------------------------------------------
2847 (*) These are not used by any known coding system.
2849 Control characters for these functions are defined by macros
2850 ISO_CODE_XXX in `coding.h'.
2852 Designations are done by the following escape sequences:
2853 ----------------------------------------------------------------------
2854 escape sequence description
2855 ----------------------------------------------------------------------
2856 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2857 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2858 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2859 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2860 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2861 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2862 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2863 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2864 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2865 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2866 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2867 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2868 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2869 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2870 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2871 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2872 ----------------------------------------------------------------------
2874 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2875 of dimension 1, chars 94, and final character <F>, etc...
2877 Note (*): Although these designations are not allowed in ISO2022,
2878 Emacs accepts them on decoding, and produces them on encoding
2879 CHARS96 character sets in a coding system which is characterized as
2880 7-bit environment, non-locking-shift, and non-single-shift.
2882 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2883 '(' must be omitted. We refer to this as "short-form" hereafter.
2885 Now you may notice that there are a lot of ways of encoding the
2886 same multilingual text in ISO2022. Actually, there exist many
2887 coding systems such as Compound Text (used in X11's inter-client
2888 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2889 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2890 localized platforms), and all of these are variants of ISO2022.
2892 In addition to the above, Emacs handles two more kinds of escape
2893 sequences: ISO6429's direction specification and Emacs' private
2894 sequence for specifying character composition.
2896 ISO6429's direction specification takes the following form:
2897 o CSI ']' -- end of the current direction
2898 o CSI '0' ']' -- end of the current direction
2899 o CSI '1' ']' -- start of left-to-right text
2900 o CSI '2' ']' -- start of right-to-left text
2901 The control character CSI (0x9B: control sequence introducer) is
2902 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2904 Character composition specification takes the following form:
2905 o ESC '0' -- start relative composition
2906 o ESC '1' -- end composition
2907 o ESC '2' -- start rule-base composition (*)
2908 o ESC '3' -- start relative composition with alternate chars (**)
2909 o ESC '4' -- start rule-base composition with alternate chars (**)
2910 Since these are not standard escape sequences of any ISO standard,
2911 the use of them with these meanings is restricted to Emacs only.
2913 (*) This form is used only in Emacs 20.7 and older versions,
2914 but newer versions can safely decode it.
2915 (**) This form is used only in Emacs 21.1 and newer versions,
2916 and older versions can't decode it.
2918 Here's a list of example usages of these composition escape
2919 sequences (categorized by `enum composition_method').
2921 COMPOSITION_RELATIVE:
2922 ESC 0 CHAR [ CHAR ] ESC 1
2923 COMPOSITION_WITH_RULE:
2924 ESC 2 CHAR [ RULE CHAR ] ESC 1
2925 COMPOSITION_WITH_ALTCHARS:
2926 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2927 COMPOSITION_WITH_RULE_ALTCHARS:
2928 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
/* Table holding one `enum iso_code_class_type' value for each of the
   256 possible byte values.  It is not referenced in this part of
   the file; presumably it classifies bytes for the ISO 2022 handlers
   -- TODO confirm against the code that fills and reads it.  */
2930 static enum iso_code_class_type iso_code_class[256];
/* True if the charset with id ID may be used with CODING: ID must be
   a valid index into CODING->safe_charsets, i.e. lie in
   0 .. CODING->max_charset_id, and its entry must not be 255, the
   value with which setup_iso_safe_charsets fills the entries of
   charsets that have no graphic register.

   A negative ID, such as an unfilled entry obtained from a charset
   table lookup, yields false instead of indexing before the start of
   the table.  CODING and ID are evaluated more than once.  */

#define SAFE_CHARSET_P(coding, id)              \
  ((id) >= 0                                    \
   && (id) <= (coding)->max_charset_id          \
   && (coding)->safe_charsets[id] != 255)
2936 static void
2937 setup_iso_safe_charsets (Lisp_Object attrs)
2939 Lisp_Object charset_list, safe_charsets;
2940 Lisp_Object request;
2941 Lisp_Object reg_usage;
2942 Lisp_Object tail;
2943 EMACS_INT reg94, reg96;
2944 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2945 int max_charset_id;
2947 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2948 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2949 && ! EQ (charset_list, Viso_2022_charset_list))
2951 charset_list = Viso_2022_charset_list;
2952 ASET (attrs, coding_attr_charset_list, charset_list);
2953 ASET (attrs, coding_attr_safe_charsets, Qnil);
2956 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2957 return;
2959 max_charset_id = 0;
2960 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2962 int id = XINT (XCAR (tail));
2963 if (max_charset_id < id)
2964 max_charset_id = id;
2967 safe_charsets = make_uninit_string (max_charset_id + 1);
2968 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2969 request = AREF (attrs, coding_attr_iso_request);
2970 reg_usage = AREF (attrs, coding_attr_iso_usage);
2971 reg94 = XINT (XCAR (reg_usage));
2972 reg96 = XINT (XCDR (reg_usage));
2974 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2976 Lisp_Object id;
2977 Lisp_Object reg;
2978 struct charset *charset;
2980 id = XCAR (tail);
2981 charset = CHARSET_FROM_ID (XINT (id));
2982 reg = Fcdr (Fassq (id, request));
2983 if (! NILP (reg))
2984 SSET (safe_charsets, XINT (id), XINT (reg));
2985 else if (charset->iso_chars_96)
2987 if (reg96 < 4)
2988 SSET (safe_charsets, XINT (id), reg96);
2990 else
2992 if (reg94 < 4)
2993 SSET (safe_charsets, XINT (id), reg94);
2996 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3000 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3001 Return true if a text is encoded in one of ISO-2022 based coding
3002 systems. */
/* CATEGORY_MASK_ISO is added to DETECT_INFO->checked.  The bytes of
   CODING->source after the first CODING->head_ascii ones are scanned
   while two sets of category masks are accumulated: FOUND, for
   categories with positive evidence, and REJECTED, for categories
   that cannot match.

   If REJECTED comes to cover every ISO category, all of them are
   added to DETECT_INFO->rejected and 0 is returned.  Otherwise
   control ends at `no_more_source', where REJECTED and the
   non-rejected part of FOUND are merged into DETECT_INFO and 1 is
   returned.  */
3004 static bool
3005 detect_coding_iso_2022 (struct coding_system *coding,
3006 struct coding_detection_info *detect_info)
3008 const unsigned char *src = coding->source, *src_base = src;
3009 const unsigned char *src_end = coding->source + coding->src_bytes;
3010 bool multibytep = coding->src_multibyte;
3011 bool single_shifting = 0;
3012 int id;
3013 int c, c1;
3014 ptrdiff_t consumed_chars = 0;
3015 int i;
3016 int rejected = 0;
3017 int found = 0;
/* Number of characters seen since the start of a composition
   sequence, or -1 when not inside one.  */
3018 int composition_count = -1;
3020 detect_info->checked |= CATEGORY_MASK_ISO;
/* For every ISO category that has a coding system assigned, refresh
   the cached pointer to and size of its safe-charsets table.  */
3022 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3024 struct coding_system *this = &(coding_categories[i]);
3025 Lisp_Object attrs, val;
3027 if (this->id < 0)
3028 continue;
3029 attrs = CODING_ID_ATTRS (this->id);
3030 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3031 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3032 setup_iso_safe_charsets (attrs);
3033 val = CODING_ATTR_SAFE_CHARSETS (attrs);
3034 this->max_charset_id = SCHARS (val) - 1;
3035 this->safe_charsets = SDATA (val);
3038 /* A coding system of this category is always ASCII compatible. */
3039 src += coding->head_ascii;
3041 while (rejected != CATEGORY_MASK_ISO)
3043 src_base = src;
3044 ONE_MORE_BYTE (c);
3045 switch (c)
3047 case ISO_CODE_ESC:
3048 if (inhibit_iso_escape_detection)
3049 break;
3050 single_shifting = 0;
3051 ONE_MORE_BYTE (c);
3052 if (c == 'N' || c == 'O')
3054 /* ESC <Fe> for SS2 or SS3. */
3055 single_shifting = 1;
3056 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3058 else if (c == '1')
3060 /* End of composition. */
3061 if (composition_count < 0
3062 || composition_count > MAX_COMPOSITION_COMPONENTS)
3063 /* Invalid */
3064 break;
3065 composition_count = -1;
3066 found |= CATEGORY_MASK_ISO;
3068 else if (c >= '0' && c <= '4')
3070 /* ESC <Fp> for start/end composition. */
3071 composition_count = 0;
3073 else
3075 if (c >= '(' && c <= '/')
3077 /* Designation sequence for a charset of dimension 1. */
3078 ONE_MORE_BYTE (c1);
3079 if (c1 < ' ' || c1 >= 0x80
3080 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3082 /* Invalid designation sequence. Just ignore. */
3083 if (c1 >= 0x80)
3084 rejected |= (CATEGORY_MASK_ISO_7BIT
3085 | CATEGORY_MASK_ISO_7_ELSE);
3086 break;
3089 else if (c == '$')
3091 /* Designation sequence for a charset of dimension 2. */
3092 ONE_MORE_BYTE (c);
3093 if (c >= '@' && c <= 'B')
3094 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
/* NOTE(review): unlike the other designation branches, the table
   entry is not checked for a negative value here before ID is used
   below.  */
3095 id = iso_charset_table[1][0][c];
3096 else if (c >= '(' && c <= '/')
3098 ONE_MORE_BYTE (c1);
3099 if (c1 < ' ' || c1 >= 0x80
3100 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3102 /* Invalid designation sequence. Just ignore. */
3103 if (c1 >= 0x80)
3104 rejected |= (CATEGORY_MASK_ISO_7BIT
3105 | CATEGORY_MASK_ISO_7_ELSE);
3106 break;
3109 else
3111 /* Invalid designation sequence. Just ignore it. */
3112 if (c >= 0x80)
3113 rejected |= (CATEGORY_MASK_ISO_7BIT
3114 | CATEGORY_MASK_ISO_7_ELSE);
3115 break;
3118 else
3120 /* Invalid escape sequence. Just ignore it. */
3121 if (c >= 0x80)
3122 rejected |= (CATEGORY_MASK_ISO_7BIT
3123 | CATEGORY_MASK_ISO_7_ELSE);
3124 break;
3127 /* We found a valid designation sequence for CHARSET. */
/* For each of the four categories tested below, the category is
   added to FOUND if the designated charset is safe for it and to
   REJECTED otherwise.  */
3128 rejected |= CATEGORY_MASK_ISO_8BIT;
3129 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3130 id))
3131 found |= CATEGORY_MASK_ISO_7;
3132 else
3133 rejected |= CATEGORY_MASK_ISO_7;
3134 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3135 id))
3136 found |= CATEGORY_MASK_ISO_7_TIGHT;
3137 else
3138 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3139 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3140 id))
3141 found |= CATEGORY_MASK_ISO_7_ELSE;
3142 else
3143 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3144 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3145 id))
3146 found |= CATEGORY_MASK_ISO_8_ELSE;
3147 else
3148 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3150 break;
3152 case ISO_CODE_SO:
3153 case ISO_CODE_SI:
3154 /* Locking shift out/in. */
3155 if (inhibit_iso_escape_detection)
3156 break;
3157 single_shifting = 0;
3158 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3159 break;
3161 case ISO_CODE_CSI:
3162 /* Control sequence introducer. */
3163 single_shifting = 0;
3164 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3165 found |= CATEGORY_MASK_ISO_8_ELSE;
3166 goto check_extra_latin;
3168 case ISO_CODE_SS2:
3169 case ISO_CODE_SS3:
3170 /* Single shift. */
3171 if (inhibit_iso_escape_detection)
3172 break;
3173 single_shifting = 0;
3174 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3175 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3176 & CODING_ISO_FLAG_SINGLE_SHIFT)
3178 found |= CATEGORY_MASK_ISO_8_1;
3179 single_shifting = 1;
3181 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3182 & CODING_ISO_FLAG_SINGLE_SHIFT)
3184 found |= CATEGORY_MASK_ISO_8_2;
3185 single_shifting = 1;
3187 if (single_shifting)
3188 break;
3189 goto check_extra_latin;
3191 default:
3192 if (c < 0)
3193 continue;
3194 if (c < 0x80)
3196 if (composition_count >= 0)
3197 composition_count++;
3198 single_shifting = 0;
3199 break;
3201 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3202 if (c >= 0xA0)
3204 found |= CATEGORY_MASK_ISO_8_1;
3205 /* Check the length of succeeding codes of the range
3206 0xA0..0xFF. If the byte length is even, we include
3207 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3208 only when we are not single shifting. */
3209 if (! single_shifting
3210 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3212 ptrdiff_t len = 1;
3213 while (src < src_end)
3215 src_base = src;
3216 ONE_MORE_BYTE (c);
3217 if (c < 0xA0)
3219 src = src_base;
3220 break;
3222 len++;
3225 if (len & 1 && src < src_end)
3227 rejected |= CATEGORY_MASK_ISO_8_2;
3228 if (composition_count >= 0)
3229 composition_count += len;
3231 else
3233 found |= CATEGORY_MASK_ISO_8_2;
3234 if (composition_count >= 0)
3235 composition_count += len / 2;
3238 break;
/* Reached for CSI, for SS2/SS3 when no category accepts single
   shifts, and for other bytes in 0x80..0x9F.  Such a byte is
   acceptable only if Vlatin_extra_code_table is a vector with a
   non-nil entry for it; otherwise every ISO category is rejected.  */
3240 check_extra_latin:
3241 if (! VECTORP (Vlatin_extra_code_table)
3242 || NILP (AREF (Vlatin_extra_code_table, c)))
3244 rejected = CATEGORY_MASK_ISO;
3245 break;
3247 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3248 & CODING_ISO_FLAG_LATIN_EXTRA)
3249 found |= CATEGORY_MASK_ISO_8_1;
3250 else
3251 rejected |= CATEGORY_MASK_ISO_8_1;
3252 rejected |= CATEGORY_MASK_ISO_8_2;
3253 break;
/* The loop above ended because every ISO category was rejected.  */
3256 detect_info->rejected |= CATEGORY_MASK_ISO;
3257 return 0;
3259 no_more_source:
3260 detect_info->rejected |= rejected;
3261 detect_info->found |= (found & ~rejected);
3262 return 1;
/* Record in CODING that the charset selected by DIM, CHARS_96 and
   the final byte FINAL is designated to graphic register REG.

   If FINAL is outside '0'..127, if no charset is registered for the
   combination, or if the charset is not safe for CODING, the
   register is marked with -2 and CHARS_96 is set to -1, which tells
   the caller to keep the escape sequence.

   Otherwise the charset id is stored for REG, after replacing
   JISX0201-Roman by ASCII when CODING_ISO_FLAG_USE_ROMAN is set, or
   JISX0208.1978 by JISX0208 when CODING_ISO_FLAG_USE_OLDJIS is set.
   CHARS_96 must be an lvalue.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int desig_id, prev_desig;                                           \
                                                                        \
    if (! (final >= '0' && final < 128                                  \
           && (desig_id = ISO_CHARSET_TABLE (dim, chars_96, final)) >= 0 \
           && SAFE_CHARSET_P (coding, desig_id)))                       \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
        break;                                                          \
      }                                                                 \
    prev_desig = CODING_ISO_DESIGNATION (coding, reg);                  \
    if (desig_id == charset_jisx0201_roman)                             \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
          desig_id = charset_ascii;                                     \
      }                                                                 \
    else if (desig_id == charset_jisx0208_1978)                         \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
          desig_id = charset_jisx0208;                                  \
      }                                                                 \
    CODING_ISO_DESIGNATION (coding, reg) = desig_id;                    \
    /* If there was an invalid designation to REG previously, and      \
       this designation is ASCII to REG, we should keep this           \
       designation sequence.  */                                        \
    if (prev_desig == -2 && desig_id == charset_ascii)                  \
      chars_96 = -1;                                                    \
  } while (0)
3300 /* Handle these composition sequences (ALT: alternate char):
3302 (1) relative composition: ESC 0 CHAR ... ESC 1
3303 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3304 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3305 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3307 When the start sequence (ESC 0/2/3/4) is found, this annotation
3308 header is produced.
3310 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3312 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3313 produced until the end sequence (ESC 1) is found:
3315 (1) CHAR ... CHAR
3316 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3317 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3318 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3320 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3321 annotation header are updated as below:
3323 (1) LENGTH: unchanged, NCHARS: number of CHARs
3324 (2) LENGTH: unchanged, NCHARS: number of CHARs
3325 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3326 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3328 If an error is found while composing, the annotation header is
3329 changed to:
3331 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3333 and the sequence [ -2 DECODED-RULE ] is changed to the original
3334 byte sequence as below:
3335 o the original byte sequence is B: [ B -1 ]
3336 o the original byte sequence is B1 B2: [ B1 B2 ]
3337 and the sequence [ -1 -1 ] is changed to the original byte
3338 sequence:
3339 [ ESC '0' ]
/* Decode a composition rule whose first byte is C1 (taken from the
   expansion site), reading one more source byte when the rule uses
   the two-byte format, and store the encoded rule in RULE.  Rules
   stored from the two-byte format have 0x100 added so that they can
   be told apart from one-byte rules.  If the rule is invalid, goto
   invalid_code.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)                                                     \
      {                                                                 \
        /* New format (after ver.21): two bytes.  */                    \
        int second;                                                     \
                                                                        \
        ONE_MORE_BYTE (second);                                         \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, second - 32))   \
          goto invalid_code;                                            \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, second - 32);        \
        rule += 0x100;   /* Distinguish it from the old format.  */     \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Old format (before ver.21): one byte holding two             \
           base-9 reference points; point 4 is mapped to 10.  */        \
        int gref = (rule) / 9;                                          \
        int nref = (rule) % 9;                                          \
                                                                        \
        gref = (gref == 4 ? 10 : gref);                                 \
        nref = (nref == 4 ? 10 : nref);                                 \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
  } while (0)
/* Turn the encoded composition rule RULE back into its original byte
   sequence, written at charbuf[idx] and charbuf[idx + 1], and add the
   number of bytes written to new_chars.  A one-byte (old format) rule
   is stored as [ B -1 ]; a two-byte (new format) rule as [ B1 B2 ].
   `charbuf', `idx' and `new_chars' are taken from the expansion
   site.  */
#define ENCODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    int encoded_rule = (rule);                                          \
    int low_part = encoded_rule % 0x100;                                \
    int gref = low_part / 12;                                           \
    int nref = low_part % 12;                                           \
                                                                        \
    if (encoded_rule >= 0x100)                                          \
      {                                                                 \
        /* New format: two bytes.  */                                   \
        charbuf[idx] = 32 + 81 + gref;                                  \
        charbuf[idx + 1] = 32 + nref;                                   \
        new_chars += 2;                                                 \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Old format: one byte; reference point 10 is stored as 4.  */ \
        gref = (gref == 10 ? 4 : gref);                                 \
        nref = (nref == 10 ? 4 : nref);                                 \
        charbuf[idx] = 32 + gref * 9 + nref;                            \
        charbuf[idx + 1] = -1;                                          \
        new_chars++;                                                    \
      }                                                                 \
  } while (0)
3391 /* Finish the current composition as invalid. */
3393 static int
3394 finish_composition (int *charbuf, struct composition_status *cmp_status)
3396 int idx = - cmp_status->length;
3397 int new_chars;
3399 /* Recover the original ESC sequence */
3400 charbuf[idx++] = ISO_CODE_ESC;
3401 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3402 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3403 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3404 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3405 : '4');
3406 charbuf[idx++] = -2;
3407 charbuf[idx++] = 0;
3408 charbuf[idx++] = -1;
3409 new_chars = cmp_status->nchars;
3410 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3411 for (; idx < 0; idx++)
3413 int elt = charbuf[idx];
3415 if (elt == -2)
3417 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3418 idx++;
3420 else if (elt == -1)
3422 charbuf[idx++] = ISO_CODE_ESC;
3423 charbuf[idx] = '0';
3424 new_chars += 2;
3427 cmp_status->state = COMPOSING_NO;
3428 return new_chars;
/* If a composition is in progress, finish it as invalid and advance
   char_offset by the value finish_composition returns.  Uses
   `cmp_status', `charbuf' and `char_offset' from the expansion
   site.  */
#define MAYBE_FINISH_COMPOSITION()                                      \
  do {                                                                  \
    if (cmp_status->state == COMPOSING_NO)                              \
      break;            /* Nothing is being composed.  */               \
    char_offset += finish_composition (charbuf, cmp_status);            \
  } while (0)
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4,
   where C1 is the byte following ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives while the components of an altchar or
   alt&rule composition are being read is the separator between the
   components and the composed characters; it is stored as [ -1 -1 ].

   Otherwise any composition in progress is finished as invalid and a
   new annotation header is produced by ADD_COMPOSITION_DATA, with
   cmp_status->length set to MAX_ANNOTATION_LENGTH and the character
   and component counts reset to zero.  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if (! ((c1) == '0'                                                  \
           && ((cmp_status->state == COMPOSING_COMPONENT_CHAR           \
                && cmp_status->method == COMPOSITION_WITH_ALTCHARS)     \
               || (cmp_status->state == COMPOSING_COMPONENT_RULE        \
                   && (cmp_status->method                               \
                       == COMPOSITION_WITH_RULE_ALTCHARS)))))           \
      {                                                                 \
        /* Start a new composition.  */                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        if ((c1) == '0')                                                \
          cmp_status->method = COMPOSITION_RELATIVE;                    \
        else if ((c1) == '2')                                           \
          cmp_status->method = COMPOSITION_WITH_RULE;                   \
        else if ((c1) == '3')                                           \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;               \
        else                                                            \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;          \
        if ((c1) <= '2')                                                \
          cmp_status->state = COMPOSING_CHAR;                           \
        else                                                            \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                 \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);       \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                     \
        cmp_status->nchars = cmp_status->ncomps = 0;                    \
        coding->annotated = 1;                                          \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Components are done; composed characters follow.  */         \
        charbuf[0] = -1;                                                \
        charbuf[1] = -1;                                                \
        charbuf += 2;                                                   \
        cmp_status->state = COMPOSING_CHAR;                             \
        cmp_status->length += 2;                                        \
      }                                                                 \
  } while (0)
/* Handle composition end sequence ESC 1.

   The composition is rejected (finished as invalid, then goto
   invalid_code) when no composed character has been read, or when
   the state does not fit the method: a rule-base composition must not
   end in state COMPOSING_CHAR, and every other method must.

   Otherwise the annotation header is completed: for the altchar
   methods the (negative) length slot is extended to cover the
   components, the NCHARS slot is filled in, char_offset is advanced
   by the number of composed characters, and the state returns to
   COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *annotation_head;                                               \
                                                                        \
    if (cmp_status->nchars == 0                                         \
        || ((cmp_status->state == COMPOSING_CHAR)                       \
            == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    annotation_head = charbuf - cmp_status->length;                     \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      annotation_head[0] -= cmp_status->ncomps + 2;                     \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      annotation_head[0] -= cmp_status->ncomps * 3;                     \
    annotation_head[2] = cmp_status->nchars;                            \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
/* Append the pair [ -2 RULE ] to charbuf, account for the two codes
   in cmp_status->length, and step cmp_status->state back by one.  */

#define STORE_COMPOSITION_RULE(rule)                                    \
  do {                                                                  \
    charbuf[0] = -2;                                                    \
    charbuf[1] = rule;                                                  \
    charbuf += 2;                                                       \
    cmp_status->length += 2;                                            \
    cmp_status->state--;                                                \
  } while (0)
/* Append a composed char or a component char C to charbuf and update
   cmp_status: the length grows by one, and C is counted in nchars
   when the state is COMPOSING_CHAR, otherwise in ncomps.  For a
   rule-base composition, or while reading the components of an
   alt&rule composition, the state then advances by one.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf++ = (c);                                                   \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    if (cmp_status->method == COMPOSITION_WITH_RULE                     \
        || (cmp_status->state == COMPOSING_COMPONENT_CHAR               \
            && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))   \
      cmp_status->state++;                                              \
  } while (0)
3528 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3530 static void
3531 decode_coding_iso_2022 (struct coding_system *coding)
3533 const unsigned char *src = coding->source + coding->consumed;
3534 const unsigned char *src_end = coding->source + coding->src_bytes;
3535 const unsigned char *src_base;
3536 int *charbuf = coding->charbuf + coding->charbuf_used;
3537 /* We may produce two annotations (charset and composition) in one
3538 loop and one more charset annotation at the end. */
3539 int *charbuf_end
3540 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3541 ptrdiff_t consumed_chars = 0, consumed_chars_base;
3542 bool multibytep = coding->src_multibyte;
3543 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3544 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3545 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3546 int charset_id_2, charset_id_3;
3547 struct charset *charset;
3548 int c;
3549 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3550 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3551 ptrdiff_t char_offset = coding->produced_char;
3552 ptrdiff_t last_offset = char_offset;
3553 int last_id = charset_ascii;
3554 bool eol_dos
3555 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3556 int byte_after_cr = -1;
3557 int i;
3559 setup_iso_safe_charsets (attrs);
3560 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3562 if (cmp_status->state != COMPOSING_NO)
3564 if (charbuf_end - charbuf < cmp_status->length)
3565 emacs_abort ();
3566 for (i = 0; i < cmp_status->length; i++)
3567 *charbuf++ = cmp_status->carryover[i];
3568 coding->annotated = 1;
3571 while (1)
3573 int c1, c2, c3;
3575 src_base = src;
3576 consumed_chars_base = consumed_chars;
3578 if (charbuf >= charbuf_end)
3580 if (byte_after_cr >= 0)
3581 src_base--;
3582 break;
3585 if (byte_after_cr >= 0)
3586 c1 = byte_after_cr, byte_after_cr = -1;
3587 else
3588 ONE_MORE_BYTE (c1);
3589 if (c1 < 0)
3590 goto invalid_code;
3592 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3594 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3595 char_offset++;
3596 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3597 continue;
3600 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3602 if (c1 == ISO_CODE_ESC)
3604 if (src + 1 >= src_end)
3605 goto no_more_source;
3606 *charbuf++ = ISO_CODE_ESC;
3607 char_offset++;
3608 if (src[0] == '%' && src[1] == '@')
3610 src += 2;
3611 consumed_chars += 2;
3612 char_offset += 2;
3613 /* We are sure charbuf can contain two more chars. */
3614 *charbuf++ = '%';
3615 *charbuf++ = '@';
3616 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3619 else
3621 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3622 char_offset++;
3624 continue;
3627 if ((cmp_status->state == COMPOSING_RULE
3628 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3629 && c1 != ISO_CODE_ESC)
3631 int rule;
3633 DECODE_COMPOSITION_RULE (rule);
3634 STORE_COMPOSITION_RULE (rule);
3635 continue;
3638 /* We produce at most one character. */
3639 switch (iso_code_class [c1])
3641 case ISO_0x20_or_0x7F:
3642 if (charset_id_0 < 0
3643 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3644 /* This is SPACE or DEL. */
3645 charset = CHARSET_FROM_ID (charset_ascii);
3646 else
3647 charset = CHARSET_FROM_ID (charset_id_0);
3648 break;
3650 case ISO_graphic_plane_0:
3651 if (charset_id_0 < 0)
3652 charset = CHARSET_FROM_ID (charset_ascii);
3653 else
3654 charset = CHARSET_FROM_ID (charset_id_0);
3655 break;
3657 case ISO_0xA0_or_0xFF:
3658 if (charset_id_1 < 0
3659 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3660 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3661 goto invalid_code;
3662 /* This is a graphic character, we fall down ... */
3664 case ISO_graphic_plane_1:
3665 if (charset_id_1 < 0)
3666 goto invalid_code;
3667 charset = CHARSET_FROM_ID (charset_id_1);
3668 break;
3670 case ISO_control_0:
3671 if (eol_dos && c1 == '\r')
3672 ONE_MORE_BYTE (byte_after_cr);
3673 MAYBE_FINISH_COMPOSITION ();
3674 charset = CHARSET_FROM_ID (charset_ascii);
3675 break;
3677 case ISO_control_1:
3678 goto invalid_code;
3680 case ISO_shift_out:
3681 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3682 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3683 goto invalid_code;
3684 CODING_ISO_INVOCATION (coding, 0) = 1;
3685 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3686 continue;
3688 case ISO_shift_in:
3689 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3690 goto invalid_code;
3691 CODING_ISO_INVOCATION (coding, 0) = 0;
3692 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3693 continue;
3695 case ISO_single_shift_2_7:
3696 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3697 goto invalid_code;
3698 case ISO_single_shift_2:
3699 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3700 goto invalid_code;
3701 /* SS2 is handled as an escape sequence of ESC 'N' */
3702 c1 = 'N';
3703 goto label_escape_sequence;
3705 case ISO_single_shift_3:
3706 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3707 goto invalid_code;
3708 /* SS2 is handled as an escape sequence of ESC 'O' */
3709 c1 = 'O';
3710 goto label_escape_sequence;
3712 case ISO_control_sequence_introducer:
3713 /* CSI is handled as an escape sequence of ESC '[' ... */
3714 c1 = '[';
3715 goto label_escape_sequence;
3717 case ISO_escape:
3718 ONE_MORE_BYTE (c1);
3719 label_escape_sequence:
3720 /* Escape sequences handled here are invocation,
3721 designation, direction specification, and character
3722 composition specification. */
3723 switch (c1)
3725 case '&': /* revision of following character set */
3726 ONE_MORE_BYTE (c1);
3727 if (!(c1 >= '@' && c1 <= '~'))
3728 goto invalid_code;
3729 ONE_MORE_BYTE (c1);
3730 if (c1 != ISO_CODE_ESC)
3731 goto invalid_code;
3732 ONE_MORE_BYTE (c1);
3733 goto label_escape_sequence;
3735 case '$': /* designation of 2-byte character set */
3736 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3737 goto invalid_code;
3739 int reg, chars96;
3741 ONE_MORE_BYTE (c1);
3742 if (c1 >= '@' && c1 <= 'B')
3743 { /* designation of JISX0208.1978, GB2312.1980,
3744 or JISX0208.1980 */
3745 reg = 0, chars96 = 0;
3747 else if (c1 >= 0x28 && c1 <= 0x2B)
3748 { /* designation of DIMENSION2_CHARS94 character set */
3749 reg = c1 - 0x28, chars96 = 0;
3750 ONE_MORE_BYTE (c1);
3752 else if (c1 >= 0x2C && c1 <= 0x2F)
3753 { /* designation of DIMENSION2_CHARS96 character set */
3754 reg = c1 - 0x2C, chars96 = 1;
3755 ONE_MORE_BYTE (c1);
3757 else
3758 goto invalid_code;
3759 DECODE_DESIGNATION (reg, 2, chars96, c1);
3760 /* We must update these variables now. */
3761 if (reg == 0)
3762 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3763 else if (reg == 1)
3764 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3765 if (chars96 < 0)
3766 goto invalid_code;
3768 continue;
3770 case 'n': /* invocation of locking-shift-2 */
3771 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3772 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3773 goto invalid_code;
3774 CODING_ISO_INVOCATION (coding, 0) = 2;
3775 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3776 continue;
3778 case 'o': /* invocation of locking-shift-3 */
3779 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3780 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3781 goto invalid_code;
3782 CODING_ISO_INVOCATION (coding, 0) = 3;
3783 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3784 continue;
3786 case 'N': /* invocation of single-shift-2 */
3787 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3788 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3789 goto invalid_code;
3790 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3791 if (charset_id_2 < 0)
3792 charset = CHARSET_FROM_ID (charset_ascii);
3793 else
3794 charset = CHARSET_FROM_ID (charset_id_2);
3795 ONE_MORE_BYTE (c1);
3796 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3797 || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3798 && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3799 ? c1 >= 0x80 : c1 < 0x80)))
3800 goto invalid_code;
3801 break;
3803 case 'O': /* invocation of single-shift-3 */
3804 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3805 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3806 goto invalid_code;
3807 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3808 if (charset_id_3 < 0)
3809 charset = CHARSET_FROM_ID (charset_ascii);
3810 else
3811 charset = CHARSET_FROM_ID (charset_id_3);
3812 ONE_MORE_BYTE (c1);
3813 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3814 || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3815 && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3816 ? c1 >= 0x80 : c1 < 0x80)))
3817 goto invalid_code;
3818 break;
3820 case '0': case '2': case '3': case '4': /* start composition */
3821 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3822 goto invalid_code;
3823 if (last_id != charset_ascii)
3825 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3826 last_id = charset_ascii;
3827 last_offset = char_offset;
3829 DECODE_COMPOSITION_START (c1);
3830 continue;
3832 case '1': /* end composition */
3833 if (cmp_status->state == COMPOSING_NO)
3834 goto invalid_code;
3835 DECODE_COMPOSITION_END ();
3836 continue;
3838 case '[': /* specification of direction */
3839 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3840 goto invalid_code;
3841 /* For the moment, nested direction is not supported.
3842 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3843 left-to-right, and nonzero means right-to-left. */
3844 ONE_MORE_BYTE (c1);
3845 switch (c1)
3847 case ']': /* end of the current direction */
3848 coding->mode &= ~CODING_MODE_DIRECTION;
3850 case '0': /* end of the current direction */
3851 case '1': /* start of left-to-right direction */
3852 ONE_MORE_BYTE (c1);
3853 if (c1 == ']')
3854 coding->mode &= ~CODING_MODE_DIRECTION;
3855 else
3856 goto invalid_code;
3857 break;
3859 case '2': /* start of right-to-left direction */
3860 ONE_MORE_BYTE (c1);
3861 if (c1 == ']')
3862 coding->mode |= CODING_MODE_DIRECTION;
3863 else
3864 goto invalid_code;
3865 break;
3867 default:
3868 goto invalid_code;
3870 continue;
3872 case '%':
3873 ONE_MORE_BYTE (c1);
3874 if (c1 == '/')
3876 /* CTEXT extended segment:
3877 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3878 We keep these bytes as is for the moment.
3879 They may be decoded by post-read-conversion. */
3880 int dim, M, L;
3881 int size;
3883 ONE_MORE_BYTE (dim);
3884 if (dim < '0' || dim > '4')
3885 goto invalid_code;
3886 ONE_MORE_BYTE (M);
3887 if (M < 128)
3888 goto invalid_code;
3889 ONE_MORE_BYTE (L);
3890 if (L < 128)
3891 goto invalid_code;
3892 size = ((M - 128) * 128) + (L - 128);
3893 if (charbuf + 6 > charbuf_end)
3894 goto break_loop;
3895 *charbuf++ = ISO_CODE_ESC;
3896 *charbuf++ = '%';
3897 *charbuf++ = '/';
3898 *charbuf++ = dim;
3899 *charbuf++ = BYTE8_TO_CHAR (M);
3900 *charbuf++ = BYTE8_TO_CHAR (L);
3901 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3903 else if (c1 == 'G')
3905 /* XFree86 extension for embedding UTF-8 in CTEXT:
3906 ESC % G --UTF-8-BYTES-- ESC % @
3907 We keep these bytes as is for the moment.
3908 They may be decoded by post-read-conversion. */
3909 if (charbuf + 3 > charbuf_end)
3910 goto break_loop;
3911 *charbuf++ = ISO_CODE_ESC;
3912 *charbuf++ = '%';
3913 *charbuf++ = 'G';
3914 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3916 else
3917 goto invalid_code;
3918 continue;
3919 break;
3921 default:
3922 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3923 goto invalid_code;
3925 int reg, chars96;
3927 if (c1 >= 0x28 && c1 <= 0x2B)
3928 { /* designation of DIMENSION1_CHARS94 character set */
3929 reg = c1 - 0x28, chars96 = 0;
3930 ONE_MORE_BYTE (c1);
3932 else if (c1 >= 0x2C && c1 <= 0x2F)
3933 { /* designation of DIMENSION1_CHARS96 character set */
3934 reg = c1 - 0x2C, chars96 = 1;
3935 ONE_MORE_BYTE (c1);
3937 else
3938 goto invalid_code;
3939 DECODE_DESIGNATION (reg, 1, chars96, c1);
3940 /* We must update these variables now. */
3941 if (reg == 0)
3942 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3943 else if (reg == 1)
3944 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3945 if (chars96 < 0)
3946 goto invalid_code;
3948 continue;
3950 break;
3952 default:
3953 emacs_abort ();
3956 if (cmp_status->state == COMPOSING_NO
3957 && charset->id != charset_ascii
3958 && last_id != charset->id)
3960 if (last_id != charset_ascii)
3961 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3962 last_id = charset->id;
3963 last_offset = char_offset;
3966 /* Now we know CHARSET and 1st position code C1 of a character.
3967 Produce a decoded character while getting 2nd and 3rd
3968 position codes C2, C3 if necessary. */
3969 if (CHARSET_DIMENSION (charset) > 1)
3971 ONE_MORE_BYTE (c2);
3972 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3973 || ((c1 & 0x80) != (c2 & 0x80)))
3974 /* C2 is not in a valid range. */
3975 goto invalid_code;
3976 if (CHARSET_DIMENSION (charset) == 2)
3977 c1 = (c1 << 8) | c2;
3978 else
3980 ONE_MORE_BYTE (c3);
3981 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3982 || ((c1 & 0x80) != (c3 & 0x80)))
3983 /* C3 is not in a valid range. */
3984 goto invalid_code;
3985 c1 = (c1 << 16) | (c2 << 8) | c2;
3988 c1 &= 0x7F7F7F;
3989 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3990 if (c < 0)
3992 MAYBE_FINISH_COMPOSITION ();
3993 for (; src_base < src; src_base++, char_offset++)
3995 if (ASCII_BYTE_P (*src_base))
3996 *charbuf++ = *src_base;
3997 else
3998 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4001 else if (cmp_status->state == COMPOSING_NO)
4003 *charbuf++ = c;
4004 char_offset++;
4006 else if ((cmp_status->state == COMPOSING_CHAR
4007 ? cmp_status->nchars
4008 : cmp_status->ncomps)
4009 >= MAX_COMPOSITION_COMPONENTS)
4011 /* Too long composition. */
4012 MAYBE_FINISH_COMPOSITION ();
4013 *charbuf++ = c;
4014 char_offset++;
4016 else
4017 STORE_COMPOSITION_CHAR (c);
4018 continue;
4020 invalid_code:
4021 MAYBE_FINISH_COMPOSITION ();
4022 src = src_base;
4023 consumed_chars = consumed_chars_base;
4024 ONE_MORE_BYTE (c);
4025 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4026 char_offset++;
4027 coding->errors++;
4028 /* Reset the invocation and designation status to the safest
4029 one; i.e. designate ASCII to the graphic register 0, and
4030 invoke that register to the graphic plane 0. This typically
4031 helps the case that an designation sequence for ASCII "ESC (
4032 B" is somehow broken (e.g. broken by a newline). */
4033 CODING_ISO_INVOCATION (coding, 0) = 0;
4034 CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
4035 charset_id_0 = charset_ascii;
4036 continue;
4038 break_loop:
4039 break;
4042 no_more_source:
4043 if (cmp_status->state != COMPOSING_NO)
4045 if (coding->mode & CODING_MODE_LAST_BLOCK)
4046 MAYBE_FINISH_COMPOSITION ();
4047 else
4049 charbuf -= cmp_status->length;
4050 for (i = 0; i < cmp_status->length; i++)
4051 cmp_status->carryover[i] = charbuf[i];
4054 else if (last_id != charset_ascii)
4055 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4056 coding->consumed_char += consumed_chars_base;
4057 coding->consumed = src_base - coding->source;
4058 coding->charbuf_used = charbuf - coding->charbuf;
4062 /* ISO2022 encoding stuff. */
4065 It is not enough to say just "ISO2022" on encoding; we have to
4066 specify more details. In Emacs, each coding system of ISO2022
4067 variant has the following specifications:
4068 1. Initial designation to G0 thru G3.
4069 2. Allows short-form designation?
4070 3. ASCII should be designated to G0 before control characters?
4071 4. ASCII should be designated to G0 at end of line?
4072 5. 7-bit environment or 8-bit environment?
4073 6. Use locking-shift?
4074 7. Use Single-shift?
4075 And the following two are only for Japanese:
4076 8. Use ASCII in place of JISX0201-1976-Roman?
4077 9. Use JISX0208-1983 in place of JISX0208-1978?
4078 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4079 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
4080 details.
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST.  If <final-char> of CHARSET is
   '@', 'A', or 'B' and the coding system CODING allows, produce
   designation sequence of short-form.

   When CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative revision, the sequence is preceded by ESC & and the
   revision byte.  The designation recorded in CODING for REG is
   updated to CHARSET.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    /* Intermediate bytes indexed by register, for 94-char and          \
       96-char sets respectively.  */                                   \
    const char *intermediates                                           \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int revision                                                        \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
                                                                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* The intermediate byte may be omitted (short form) only       \
           for a 94-char set designated to register 0 whose final       \
           byte is '@', 'A' or 'B', and only if long form is not        \
           requested.  */                                               \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM    \
            || (reg) != 0                                               \
            || final_char < '@' || final_char > 'B')                    \
          {                                                             \
            EMIT_ONE_ASCII_BYTE (intermediates[reg]);                   \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE (intermediates[reg]);                       \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
/* The following two macros produce codes for ISO2022 single-shift
   functions (single-shift-2 and single-shift-3): the escape sequence
   ESC N / ESC O when CODING has CODING_ISO_FLAG_SEVEN_BITS set, and
   the control character SS2 / SS3 otherwise.  Both mark CODING as
   single shifting.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3), and record in
   CODING which graphic register is now invoked to graphic plane 0.

     shift-in         SI      invokes register 0
     shift-out        SO      invokes register 1
     locking-shift-2  ESC n   invokes register 2
     locking-shift-3  ESC o   invokes register 3

   These are the same sequences the decoder in this file accepts.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* The final byte must be 'o'; 'n' is locking-shift-2 and would make
   a decoder invoke register 2 instead of register 3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the JISX0201-Roman charset.  The code is emitted with the high
   bit cleared when CHARSET is invoked to graphic plane 0 and with the
   high bit set when it is invoked to graphic plane 1; after a single
   shift the CODING_ISO_FLAG_SEVEN_BITS flag selects between the two
   forms.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    for (;;)                                                            \
      {                                                                 \
        int id = CHARSET_ID (charset);                                  \
                                                                        \
        if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)     \
            && id == charset_ascii)                                     \
          {                                                             \
            id = charset_jisx0201_roman;                                \
            charset = CHARSET_FROM_ID (id);                             \
          }                                                             \
                                                                        \
        if (CODING_ISO_SINGLE_SHIFTING (coding))                        \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
              EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                        \
            else                                                        \
              EMIT_ONE_BYTE ((c1) | 0x80);                              \
            CODING_ISO_SINGLE_SHIFTING (coding) = 0;                    \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))               \
          {                                                             \
            EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                          \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))               \
          {                                                             \
            EMIT_ONE_BYTE ((c1) | 0x80);                                \
            break;                                                      \
          }                                                             \
        /* Since CHARSET is not yet invoked to any graphic planes, we   \
           must invoke it, or, at first, designate it to some graphic   \
           register.  Then repeat the loop to actually produce the      \
           character.  */                                               \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
      }                                                                 \
  } while (0)
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, it is
   replaced by the JISX0208.1978 charset.  The two codes are emitted
   with the high bits cleared when CHARSET is invoked to graphic plane
   0 and with the high bits set when it is invoked to graphic plane 1;
   after a single shift the CODING_ISO_FLAG_SEVEN_BITS flag selects
   between the two forms.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    for (;;)                                                            \
      {                                                                 \
        int id = CHARSET_ID (charset);                                  \
                                                                        \
        if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)    \
            && id == charset_jisx0208)                                  \
          {                                                             \
            id = charset_jisx0208_1978;                                 \
            charset = CHARSET_FROM_ID (id);                             \
          }                                                             \
                                                                        \
        if (CODING_ISO_SINGLE_SHIFTING (coding))                        \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
              EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);          \
            else                                                        \
              EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                \
            CODING_ISO_SINGLE_SHIFTING (coding) = 0;                    \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))               \
          {                                                             \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))               \
          {                                                             \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
            break;                                                      \
          }                                                             \
        /* Since CHARSET is not yet invoked to any graphic planes, we   \
           must invoke it, or, at first, designate it to some graphic   \
           register.  Then repeat the loop to actually produce the      \
           character.  */                                               \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
      }                                                                 \
  } while (0)
/* Encode character C of CHARSET: obtain its code point with
   CODING_ENCODE_CHAR, then emit it with the DIMENSION1 macro when
   CHARSET has dimension 1, or otherwise with the DIMENSION2 macro,
   passing the code shifted right by 8 bits as the first position code
   and its low 8 bits as the second.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
/* Reading guide for encode_invocation_designation:
   1. Search the four graphic registers of CODING for one whose current
      designation equals the id of CHARSET.
   2. If none matches, take the register CHARSET requests (register 0
      when the request is negative) and run ENCODE_DESIGNATION for it.
   3. If that register is invoked to neither plane 0 nor plane 1, run
      the shift macro selected by the register number.
   The value of *P_NCHARS is copied into `produced_chars' on entry and
   stored back on exit; the function returns DST.  `multibytep' and
   `produced_chars' are never named directly in the statements below;
   presumably the ENCODE_ macros, defined outside this section, refer
   to them (and to `dst') by name -- confirm before renaming a local.  */
4288 /* Produce designation and invocation codes at a place pointed by DST
4289 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4290 Return new DST. */
4292 static unsigned char *
4293 encode_invocation_designation (struct charset *charset,
4294 struct coding_system *coding,
4295 unsigned char *dst, ptrdiff_t *p_nchars)
4297 bool multibytep = coding->dst_multibyte;
4298 ptrdiff_t produced_chars = *p_nchars;
4299 int reg; /* graphic register number */
4300 int id = CHARSET_ID (charset);
4302 /* At first, check designations. */
4303 for (reg = 0; reg < 4; reg++)
4304 if (id == CODING_ISO_DESIGNATION (coding, reg))
4305 break;
4307 if (reg >= 4)
4309 /* CHARSET is not yet designated to any graphic registers. */
4310 /* At first check the requested designation. */
4311 reg = CODING_ISO_REQUEST (coding, id);
4312 if (reg < 0)
4313 /* Since CHARSET requests no special designation, designate it
4314 to graphic register 0. */
4315 reg = 0;
4317 ENCODE_DESIGNATION (charset, reg, coding);
4320 if (CODING_ISO_INVOCATION (coding, 0) != reg
4321 && CODING_ISO_INVOCATION (coding, 1) != reg)
4323 /* Since the graphic register REG is not invoked to any graphic
4324 planes, invoke it to graphic plane 0. */
4325 switch (reg)
4327 case 0: /* graphic register 0 */
4328 ENCODE_SHIFT_IN;
4329 break;
4331 case 1: /* graphic register 1 */
4332 ENCODE_SHIFT_OUT;
4333 break;
4335 case 2: /* graphic register 2 */
/* Registers 2 and 3 use the single-shift macro when
   CODING_ISO_FLAG_SINGLE_SHIFT is set in the coding system's flags,
   and the locking-shift macro otherwise.  */
4336 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4337 ENCODE_SINGLE_SHIFT_2;
4338 else
4339 ENCODE_LOCKING_SHIFT_2;
4340 break;
4342 case 3: /* graphic register 3 */
4343 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4344 ENCODE_SINGLE_SHIFT_3;
4345 else
4346 ENCODE_LOCKING_SHIFT_3;
4347 break;
/* Hand the (possibly macro-updated) character count back to the caller.  */
4351 *p_nchars = produced_chars;
4352 return dst;
/* Reading guide for ENCODE_RESET_PLANE_AND_REGISTER:
   A statement macro without arguments; it uses a variable named
   `coding' from the place where it is expanded.  If invocation slot 0
   of CODING is not register 0 it runs ENCODE_SHIFT_IN.  Then, for each
   of the four graphic registers that has an initial charset (id >= 0)
   different from its current designation, it runs ENCODE_DESIGNATION
   with that initial charset.  `reg' and `charset' are locals of the
   macro's own do-while block.  */
4356 /* Produce codes for designation and invocation to reset the graphic
4357 planes and registers to initial state. */
4358 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4359 do { \
4360 int reg; \
4361 struct charset *charset; \
4363 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4364 ENCODE_SHIFT_IN; \
4365 for (reg = 0; reg < 4; reg++) \
4366 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4367 && (CODING_ISO_DESIGNATION (coding, reg) \
4368 != CODING_ISO_INITIAL (coding, reg))) \
4370 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4371 ENCODE_DESIGNATION (charset, reg, coding); \
4373 } while (0)
/* Reading guide for encode_designation_at_bol:
   - r[reg] holds the id of the first charset found that requests
     graphic register REG, or -1 while none has been found.
   - The scan reads ints from CHARBUF until CHARBUF_END is reached, a
     '\n' is read, or all four registers have a charset.
   - Afterwards ENCODE_DESIGNATION is run for every register whose
     recorded charset differs from its current designation.
   The result is dst - orig.  `multibytep' and `produced_chars' are not
   named directly in any statement below; presumably ENCODE_DESIGNATION
   (defined outside this section) uses them and advances `dst' --
   confirm before renaming locals.  */
4376 /* Produce designation sequences of charsets in the line started from
4377 CHARBUF to a place pointed by DST, and return the number of
4378 produced bytes. DST should not directly point a buffer text area
4379 which may be relocated by char_charset call.
4381 If the current block ends before any end-of-line, we may fail to
4382 find all the necessary designations. */
4384 static ptrdiff_t
4385 encode_designation_at_bol (struct coding_system *coding,
4386 int *charbuf, int *charbuf_end,
4387 unsigned char *dst)
4389 unsigned char *orig = dst;
4390 struct charset *charset;
4391 /* Table of charsets to be designated to each graphic register. */
4392 int r[4];
4393 int c, found = 0, reg;
4394 ptrdiff_t produced_chars = 0;
4395 bool multibytep = coding->dst_multibyte;
4396 Lisp_Object attrs;
4397 Lisp_Object charset_list;
4399 attrs = CODING_ID_ATTRS (coding->id);
4400 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
/* The symbol Qiso_2022 in place of a list selects the global
   Viso_2022_charset_list.  */
4401 if (EQ (charset_list, Qiso_2022))
4402 charset_list = Viso_2022_charset_list;
4404 for (reg = 0; reg < 4; reg++)
4405 r[reg] = -1;
4407 while (charbuf < charbuf_end && found < 4)
4409 int id;
4411 c = *charbuf++;
4412 if (c == '\n')
4413 break;
4414 charset = char_charset (c, charset_list, NULL);
4415 id = CHARSET_ID (charset);
4416 reg = CODING_ISO_REQUEST (coding, id);
/* Only the first charset seen for a given requested register is
   recorded; charsets with a negative request are ignored.  */
4417 if (reg >= 0 && r[reg] < 0)
4419 found++;
4420 r[reg] = id;
4424 if (found)
4426 for (reg = 0; reg < 4; reg++)
4427 if (r[reg] >= 0
4428 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4429 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4432 return dst - orig;
/* Reading guide for encode_coding_iso_2022:
   Encodes the ints in coding->charbuf into coding->destination,
   starting at offset coding->produced.  Per element:
   - a negative value starts an annotation; only the charset annotation
     is acted on (it sets preferred_charset_id);
   - control codes (< 0x20 or 0x7F) may first trigger a reset and/or
     re-arm beginning-of-line designation, depending on the ISO flags,
     and are then emitted as one byte;
   - ASCII goes out as one byte when `ascii_compatible' is true,
     otherwise through ENCODE_ISO_CHARACTER with the ASCII charset;
   - eight-bit raw-byte characters go out as their byte value;
   - everything else is mapped to a charset and passed to
     ENCODE_ISO_CHARACTER.
   On exit the beginning-of-line state, produced_char and produced are
   stored back into CODING.  The function always returns 0.  */
4435 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4437 static bool
4438 encode_coding_iso_2022 (struct coding_system *coding)
4440 bool multibytep = coding->dst_multibyte;
4441 int *charbuf = coding->charbuf;
4442 int *charbuf_end = charbuf + coding->charbuf_used;
4443 unsigned char *dst = coding->destination + coding->produced;
4444 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4445 int safe_room = 16;
4446 bool bol_designation
4447 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4448 && CODING_ISO_BOL (coding));
4449 ptrdiff_t produced_chars = 0;
4450 Lisp_Object attrs, eol_type, charset_list;
4451 bool ascii_compatible;
4452 int c;
4453 int preferred_charset_id = -1;
4455 CODING_GET_INFO (coding, attrs, charset_list);
/* With EOL conversion inhibited, or when the EOL type is still a
   vector, the Unix convention is used.  */
4456 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4457 if (VECTORP (eol_type))
4458 eol_type = Qunix;
4460 setup_iso_safe_charsets (attrs);
4461 /* Charset list may have been changed. */
4462 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4463 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
/* ASCII may bypass the ISO machinery only when the coding system is
   marked ASCII compatible and uses neither designation nor locking
   shift.  */
4465 ascii_compatible
4466 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4467 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4468 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4470 while (charbuf < charbuf_end)
4472 ASSURE_DESTINATION (safe_room);
4474 if (bol_designation)
4476 /* We have to produce designation sequences if any now. */
4477 unsigned char desig_buf[16];
4478 ptrdiff_t nbytes;
4479 ptrdiff_t offset;
/* The designations are built in the local desig_buf first.  If
   charset_map_loaded was set during the call and
   coding_change_destination reports a nonzero offset, both DST and
   DST_END are shifted by it before the bytes are copied.  */
4481 charset_map_loaded = 0;
4482 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4483 desig_buf);
4484 if (charset_map_loaded
4485 && (offset = coding_change_destination (coding)))
4487 dst += offset;
4488 dst_end += offset;
4490 memcpy (dst, desig_buf, nbytes);
4491 dst += nbytes;
4492 /* We are sure that designation sequences are all ASCII bytes. */
4493 produced_chars += nbytes;
4494 bol_designation = 0;
4495 ASSURE_DESTINATION (safe_room);
4498 c = *charbuf++;
4500 if (c < 0)
4502 /* Handle an annotation. */
4503 switch (*charbuf)
4505 case CODING_ANNOTATE_COMPOSITION_MASK:
4506 /* Not yet implemented. */
4507 break;
4508 case CODING_ANNOTATE_CHARSET_MASK:
/* A preferred charset that is not a member of charset_list is
   discarded again.  */
4509 preferred_charset_id = charbuf[2];
4510 if (preferred_charset_id >= 0
4511 && NILP (Fmemq (make_number (preferred_charset_id),
4512 charset_list)))
4513 preferred_charset_id = -1;
4514 break;
4515 default:
4516 emacs_abort ();
/* C is negative here and its element was already consumed above, so
   this skips the remaining -C - 1 elements of the annotation.  */
4518 charbuf += -c - 1;
4519 continue;
4522 /* Now encode the character C. */
4523 if (c < 0x20 || c == 0x7F)
4525 if (c == '\n'
4526 || (c == '\r' && EQ (eol_type, Qmac)))
4528 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4529 ENCODE_RESET_PLANE_AND_REGISTER ();
4530 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4532 int i;
4534 for (i = 0; i < 4; i++)
4535 CODING_ISO_DESIGNATION (coding, i)
4536 = CODING_ISO_INITIAL (coding, i);
4538 bol_designation = ((CODING_ISO_FLAGS (coding)
4539 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4540 != 0);
4542 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4543 ENCODE_RESET_PLANE_AND_REGISTER ();
4544 EMIT_ONE_ASCII_BYTE (c);
4546 else if (ASCII_CHAR_P (c))
4548 if (ascii_compatible)
4549 EMIT_ONE_ASCII_BYTE (c);
4550 else
4552 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4553 ENCODE_ISO_CHARACTER (charset, c);
4556 else if (CHAR_BYTE8_P (c))
4558 c = CHAR_TO_BYTE8 (c);
4559 EMIT_ONE_BYTE (c);
4561 else
4563 struct charset *charset;
/* Try the annotated preferred charset first; fall back to a search
   of charset_list when it cannot hold C.  */
4565 if (preferred_charset_id >= 0)
4567 bool result;
4569 charset = CHARSET_FROM_ID (preferred_charset_id);
4570 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4571 if (! result)
4572 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4573 NULL, charset);
4575 else
4576 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4577 NULL, charset);
4578 if (!charset)
/* No charset can represent C: in safe-encoding mode substitute
   CODING_INHIBIT_CHARACTER_SUBSTITUTION in the ASCII charset,
   otherwise encode the coding system's default character.  */
4580 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4582 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4583 charset = CHARSET_FROM_ID (charset_ascii);
4585 else
4587 c = coding->default_char;
4588 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4589 charset_list, NULL, charset);
4592 ENCODE_ISO_CHARACTER (charset, c);
/* At the end of the last block, reset planes and registers if the
   coding system resets at end of line.  */
4596 if (coding->mode & CODING_MODE_LAST_BLOCK
4597 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4599 ASSURE_DESTINATION (safe_room);
4600 ENCODE_RESET_PLANE_AND_REGISTER ();
4602 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4603 CODING_ISO_BOL (coding) = bol_designation;
4604 coding->produced_char += produced_chars;
4605 coding->produced = dst - coding->destination;
4606 return 0;
4610 /*** 8,9. SJIS and BIG5 handlers ***/
4612 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4613 quite widely. So, for the moment, Emacs supports them in the bare
4614 C code. But, in the future, they may be supported only by CCL. */
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.

   --- CODE RANGE of SJIS ---
   (character set)        (range)
   ASCII                  0x00 .. 0x7F
   KATAKANA-JISX0201      0xA0 .. 0xDF
   JISX0208 (1st byte)    0x81 .. 0x9F and 0xE0 .. 0xEF
            (2nd byte)    0x40 .. 0x7E and 0x80 .. 0xFC
   -------------------------------
*/
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.

   --- CODE RANGE of BIG5 ---
   (character set)        (range)
   ASCII                  0x00 .. 0x7F
   Big5 (1st byte)        0xA1 .. 0xFE
        (2nd byte)        0x40 .. 0x7E and 0xA1 .. 0xFE
   --------------------------
*/
4646 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4647 Return true if a text is encoded in SJIS. */
/* Reading guide: values below 0x80 are skipped.  A byte in 0x81..0x9F
   or 0xE0..max_first_byte_of_2_byte_code must be followed by a byte in
   0x40..0xFC other than 0x7F; a byte in 0xA0..0xDF stands alone.  Any
   other byte leaves the loop, marks the category rejected and returns
   0.  max_first_byte_of_2_byte_code is 0xFC when the charset list has
   more than three entries and 0xEF otherwise.  The `no_more_source'
   label is not the target of any goto visible here; presumably
   ONE_MORE_BYTE (defined outside this section) jumps to it when the
   input runs out -- confirm.  On that path the category is rejected
   only if src_base < src and CODING_MODE_LAST_BLOCK is set; otherwise
   FOUND is merged into detect_info->found and 1 is returned.  */
4649 static bool
4650 detect_coding_sjis (struct coding_system *coding,
4651 struct coding_detection_info *detect_info)
4653 const unsigned char *src = coding->source, *src_base;
4654 const unsigned char *src_end = coding->source + coding->src_bytes;
4655 bool multibytep = coding->src_multibyte;
4656 ptrdiff_t consumed_chars = 0;
4657 int found = 0;
4658 int c;
4659 Lisp_Object attrs, charset_list;
4660 int max_first_byte_of_2_byte_code;
4662 CODING_GET_INFO (coding, attrs, charset_list);
4663 max_first_byte_of_2_byte_code
4664 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4666 detect_info->checked |= CATEGORY_MASK_SJIS;
4667 /* A coding system of this category is always ASCII compatible. */
4668 src += coding->head_ascii;
4670 while (1)
/* src_base remembers where the current character starts, for the
   src_base < src test after the loop.  */
4672 src_base = src;
4673 ONE_MORE_BYTE (c);
4674 if (c < 0x80)
4675 continue;
4676 if ((c >= 0x81 && c <= 0x9F)
4677 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4679 ONE_MORE_BYTE (c);
4680 if (c < 0x40 || c == 0x7F || c > 0xFC)
4681 break;
4682 found = CATEGORY_MASK_SJIS;
4684 else if (c >= 0xA0 && c < 0xE0)
4685 found = CATEGORY_MASK_SJIS;
4686 else
4687 break;
/* Reached only through one of the `break's above.  */
4689 detect_info->rejected |= CATEGORY_MASK_SJIS;
4690 return 0;
4692 no_more_source:
4693 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4695 detect_info->rejected |= CATEGORY_MASK_SJIS;
4696 return 0;
4698 detect_info->found |= found;
4699 return 1;
/* Reading guide for detect_coding_big5: values below 0x80 are skipped.
   A byte of 0xA1 or above must be followed by a second byte that is
   at least 0x40 and not in 0x7F..0xA0.  A first byte in 0x80..0xA0
   leaves the loop, marks the category rejected and returns 0.  The
   `no_more_source' label is not the target of any goto visible here;
   presumably ONE_MORE_BYTE (defined outside this section) jumps to it
   when the input runs out -- confirm.  On that path the category is
   rejected only if src_base < src and CODING_MODE_LAST_BLOCK is set;
   otherwise FOUND is merged into detect_info->found and 1 is
   returned.  */
4702 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4703 Return true if a text is encoded in BIG5. */
4705 static bool
4706 detect_coding_big5 (struct coding_system *coding,
4707 struct coding_detection_info *detect_info)
4709 const unsigned char *src = coding->source, *src_base;
4710 const unsigned char *src_end = coding->source + coding->src_bytes;
4711 bool multibytep = coding->src_multibyte;
4712 ptrdiff_t consumed_chars = 0;
4713 int found = 0;
4714 int c;
4716 detect_info->checked |= CATEGORY_MASK_BIG5;
4717 /* A coding system of this category is always ASCII compatible. */
4718 src += coding->head_ascii;
4720 while (1)
4722 src_base = src;
4723 ONE_MORE_BYTE (c);
4724 if (c < 0x80)
4725 continue;
4726 if (c >= 0xA1)
4728 ONE_MORE_BYTE (c);
/* NOTE(review): an invalid second byte returns 0 directly, without
   setting CATEGORY_MASK_BIG5 in detect_info->rejected, whereas an
   invalid first byte (and the corresponding case in
   detect_coding_sjis) goes through the rejecting path.  Whether the
   difference is intended cannot be told from here -- confirm.  */
4729 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4730 return 0;
4731 found = CATEGORY_MASK_BIG5;
4733 else
4734 break;
4736 detect_info->rejected |= CATEGORY_MASK_BIG5;
4737 return 0;
4739 no_more_source:
4740 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4742 detect_info->rejected |= CATEGORY_MASK_BIG5;
4743 return 0;
4745 detect_info->found |= found;
4746 return 1;
/* Reading guide for decode_coding_sjis:
   Decodes bytes from coding->source (starting at coding->consumed)
   into ints appended to coding->charbuf.  The charset list supplies,
   in order, the roman, kana and kanji charsets and an optional fourth
   charset (charset_kanji2, NULL when the list ends after three).
   Lead byte dispatch:
     < 0x80        one byte, charset_roman
     0x80, 0xA0    invalid
     0xA1..0xDF    one byte, charset_kana (code is byte & 0x7F)
     up to 0xEF    two bytes, charset_kanji via SJIS_TO_JIS
     up to 0xFC    two bytes, charset_kanji2 via SJIS_TO_JIS2, only
                   when charset_kanji2 is non-NULL
     otherwise     invalid
   A second byte must be in 0x40..0xFC and not 0x7F.  An invalid
   sequence rewinds to src_base, stores its first byte alone and
   increments coding->errors.  With DOS end-of-line handling, the byte
   following a '\r' is read ahead into byte_after_cr and consumed by
   the next iteration.  `no_more_source' is presumably the target of
   ONE_MORE_BYTE (defined outside this section) at end of input --
   confirm.  */
4749 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4751 static void
4752 decode_coding_sjis (struct coding_system *coding)
4754 const unsigned char *src = coding->source + coding->consumed;
4755 const unsigned char *src_end = coding->source + coding->src_bytes;
4756 const unsigned char *src_base;
4757 int *charbuf = coding->charbuf + coding->charbuf_used;
4758 /* We may produce one charset annotation in one loop and one more at
4759 the end. */
4760 int *charbuf_end
4761 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4762 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4763 bool multibytep = coding->src_multibyte;
4764 struct charset *charset_roman, *charset_kanji, *charset_kana;
4765 struct charset *charset_kanji2;
4766 Lisp_Object attrs, charset_list, val;
4767 ptrdiff_t char_offset = coding->produced_char;
4768 ptrdiff_t last_offset = char_offset;
4769 int last_id = charset_ascii;
4770 bool eol_dos
4771 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4772 int byte_after_cr = -1;
4774 CODING_GET_INFO (coding, attrs, charset_list);
4776 val = charset_list;
4777 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4778 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4779 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4780 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4782 while (1)
4784 int c, c1;
4785 struct charset *charset;
4787 src_base = src;
4788 consumed_chars_base = consumed_chars;
4790 if (charbuf >= charbuf_end)
/* The output buffer is full.  If a byte was read ahead after a '\r',
   step src_base back so that byte is not reported as consumed.  */
4792 if (byte_after_cr >= 0)
4793 src_base--;
4794 break;
4797 if (byte_after_cr >= 0)
4798 c = byte_after_cr, byte_after_cr = -1;
4799 else
4800 ONE_MORE_BYTE (c);
4801 if (c < 0)
4802 goto invalid_code;
4803 if (c < 0x80)
4805 if (eol_dos && c == '\r')
4806 ONE_MORE_BYTE (byte_after_cr);
4807 charset = charset_roman;
4809 else if (c == 0x80 || c == 0xA0)
4810 goto invalid_code;
4811 else if (c >= 0xA1 && c <= 0xDF)
4813 /* SJIS -> JISX0201-Kana */
4814 c &= 0x7F;
4815 charset = charset_kana;
4817 else if (c <= 0xEF)
4819 /* SJIS -> JISX0208 */
4820 ONE_MORE_BYTE (c1);
4821 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4822 goto invalid_code;
4823 c = (c << 8) | c1;
4824 SJIS_TO_JIS (c);
4825 charset = charset_kanji;
4827 else if (c <= 0xFC && charset_kanji2)
4829 /* SJIS -> JISX0213-2 */
4830 ONE_MORE_BYTE (c1);
4831 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4832 goto invalid_code;
4833 c = (c << 8) | c1;
4834 SJIS_TO_JIS2 (c);
4835 charset = charset_kanji2;
4837 else
4838 goto invalid_code;
/* When the charset changes to a non-ASCII one, close the run of the
   previous non-ASCII charset with ADD_CHARSET_DATA and start a new
   run at the current character offset.  */
4839 if (charset->id != charset_ascii
4840 && last_id != charset->id)
4842 if (last_id != charset_ascii)
4843 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4844 last_id = charset->id;
4845 last_offset = char_offset;
4847 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4848 *charbuf++ = c;
4849 char_offset++;
4850 continue;
4852 invalid_code:
/* Re-read just the first byte of the bad sequence; a negative value
   is stored negated, anything else as BYTE8_TO_CHAR of the byte.  */
4853 src = src_base;
4854 consumed_chars = consumed_chars_base;
4855 ONE_MORE_BYTE (c);
4856 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4857 char_offset++;
4858 coding->errors++;
4861 no_more_source:
4862 if (last_id != charset_ascii)
4863 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4864 coding->consumed_char += consumed_chars_base;
4865 coding->consumed = src_base - coding->source;
4866 coding->charbuf_used = charbuf - coding->charbuf;
/* Reading guide for decode_coding_big5:
   Decodes bytes from coding->source (starting at coding->consumed)
   into ints appended to coding->charbuf.  The first two entries of
   the charset list are the roman and the Big5 charset.
     lead byte < 0x80       one byte, charset_roman
     lead byte 0xA1..0xFE   two bytes, charset_big5; the second byte
                            must be in 0x40..0x7E or 0xA1..0xFE
     anything else          invalid
   An invalid sequence rewinds to src_base, stores its first byte
   alone and increments coding->errors.  With DOS end-of-line
   handling, the byte following a '\r' is read ahead into
   byte_after_cr and consumed by the next iteration.
   `no_more_source' is presumably the target of ONE_MORE_BYTE (defined
   outside this section) at end of input -- confirm.  */
4869 static void
4870 decode_coding_big5 (struct coding_system *coding)
4872 const unsigned char *src = coding->source + coding->consumed;
4873 const unsigned char *src_end = coding->source + coding->src_bytes;
4874 const unsigned char *src_base;
4875 int *charbuf = coding->charbuf + coding->charbuf_used;
4876 /* We may produce one charset annotation in one loop and one more at
4877 the end. */
4878 int *charbuf_end
4879 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4880 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4881 bool multibytep = coding->src_multibyte;
4882 struct charset *charset_roman, *charset_big5;
4883 Lisp_Object attrs, charset_list, val;
4884 ptrdiff_t char_offset = coding->produced_char;
4885 ptrdiff_t last_offset = char_offset;
4886 int last_id = charset_ascii;
4887 bool eol_dos
4888 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4889 int byte_after_cr = -1;
4891 CODING_GET_INFO (coding, attrs, charset_list);
4892 val = charset_list;
4893 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4894 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4896 while (1)
4898 int c, c1;
4899 struct charset *charset;
4901 src_base = src;
4902 consumed_chars_base = consumed_chars;
4904 if (charbuf >= charbuf_end)
/* The output buffer is full.  If a byte was read ahead after a '\r',
   step src_base back so that byte is not reported as consumed.  */
4906 if (byte_after_cr >= 0)
4907 src_base--;
4908 break;
4911 if (byte_after_cr >= 0)
4912 c = byte_after_cr, byte_after_cr = -1;
4913 else
4914 ONE_MORE_BYTE (c);
4916 if (c < 0)
4917 goto invalid_code;
4918 if (c < 0x80)
4920 if (eol_dos && c == '\r')
4921 ONE_MORE_BYTE (byte_after_cr);
4922 charset = charset_roman;
4924 else
4926 /* BIG5 -> Big5 */
4927 if (c < 0xA1 || c > 0xFE)
4928 goto invalid_code;
4929 ONE_MORE_BYTE (c1);
4930 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4931 goto invalid_code;
4932 c = c << 8 | c1;
4933 charset = charset_big5;
/* When the charset changes to a non-ASCII one, close the run of the
   previous non-ASCII charset with ADD_CHARSET_DATA and start a new
   run at the current character offset.  */
4935 if (charset->id != charset_ascii
4936 && last_id != charset->id)
4938 if (last_id != charset_ascii)
4939 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4940 last_id = charset->id;
4941 last_offset = char_offset;
4943 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4944 *charbuf++ = c;
4945 char_offset++;
4946 continue;
4948 invalid_code:
/* Re-read just the first byte of the bad sequence; a negative value
   is stored negated, anything else as BYTE8_TO_CHAR of the byte.  */
4949 src = src_base;
4950 consumed_chars = consumed_chars_base;
4951 ONE_MORE_BYTE (c);
4952 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4953 char_offset++;
4954 coding->errors++;
4957 no_more_source:
4958 if (last_id != charset_ascii)
4959 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4960 coding->consumed_char += consumed_chars_base;
4961 coding->consumed = src_base - coding->source;
4962 coding->charbuf_used = charbuf - coding->charbuf;
/* Reading guide for encode_coding_sjis:
   The first charset of the list is skipped; the next two are the kana
   and kanji charsets, and an optional further one is charset_kanji2
   (NULL when absent).  Per character of coding->charbuf:
   - ASCII, when the coding system is ASCII compatible: one byte;
   - eight-bit raw-byte character: its byte value;
   - otherwise the character is mapped to (charset, code):
       charset_kanji   two bytes after JIS_TO_SJIS
       charset_kana    one byte, code | 0x80
       charset_kanji2  two bytes after JIS_TO_SJIS2 when the high byte
                       of CODE is one of the values tested below,
                       otherwise one byte code & 0x7F
       anything else   one byte code & 0x7F
   emacs_abort is called if CODE equals the charset's invalid code.
   The function always returns 0.  */
4965 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4966 This function can encode charsets `ascii', `katakana-jisx0201',
4967 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4968 are sure that all these charsets are registered as official charset
4969 (i.e. do not have extended leading-codes). Characters of other
4970 charsets are produced without any encoding. */
4972 static bool
4973 encode_coding_sjis (struct coding_system *coding)
4975 bool multibytep = coding->dst_multibyte;
4976 int *charbuf = coding->charbuf;
4977 int *charbuf_end = charbuf + coding->charbuf_used;
4978 unsigned char *dst = coding->destination + coding->produced;
4979 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4980 int safe_room = 4;
4981 ptrdiff_t produced_chars = 0;
4982 Lisp_Object attrs, charset_list, val;
4983 bool ascii_compatible;
4984 struct charset *charset_kanji, *charset_kana;
4985 struct charset *charset_kanji2;
4986 int c;
4988 CODING_GET_INFO (coding, attrs, charset_list);
4989 val = XCDR (charset_list);
4990 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4991 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4992 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4994 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4996 while (charbuf < charbuf_end)
4998 ASSURE_DESTINATION (safe_room);
4999 c = *charbuf++;
5000 /* Now encode the character C. */
5001 if (ASCII_CHAR_P (c) && ascii_compatible)
5002 EMIT_ONE_ASCII_BYTE (c);
5003 else if (CHAR_BYTE8_P (c))
5005 c = CHAR_TO_BYTE8 (c);
5006 EMIT_ONE_BYTE (c);
5008 else
5010 unsigned code;
5011 struct charset *charset;
5012 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5013 &code, charset);
5015 if (!charset)
/* No charset of the list can represent C: in safe-encoding mode use
   CODING_INHIBIT_CHARACTER_SUBSTITUTION as an ASCII code, otherwise
   look up the coding system's default character instead.  */
5017 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5019 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5020 charset = CHARSET_FROM_ID (charset_ascii);
5022 else
5024 c = coding->default_char;
5025 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5026 charset_list, &code, charset);
5029 if (code == CHARSET_INVALID_CODE (charset))
5030 emacs_abort ();
5031 if (charset == charset_kanji)
5033 int c1, c2;
5034 JIS_TO_SJIS (code);
5035 c1 = code >> 8, c2 = code & 0xFF;
5036 EMIT_TWO_BYTES (c1, c2);
5038 else if (charset == charset_kana)
5039 EMIT_ONE_BYTE (code | 0x80);
5040 else if (charset_kanji2 && charset == charset_kanji2)
5042 int c1, c2;
/* Only codes whose high byte is 0x21, 0x23..0x25, 0x28, 0x2C..0x2F or
   at least 0x6E are converted with JIS_TO_SJIS2; presumably these are
   the only rows that have an SJIS representation -- confirm.  */
5044 c1 = code >> 8;
5045 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5046 || c1 == 0x28
5047 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5049 JIS_TO_SJIS2 (code);
5050 c1 = code >> 8, c2 = code & 0xFF;
5051 EMIT_TWO_BYTES (c1, c2);
5053 else
5054 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5056 else
5057 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5060 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5061 coding->produced_char += produced_chars;
5062 coding->produced = dst - coding->destination;
5063 return 0;
/* Reading guide for encode_coding_big5:
   The second entry of the charset list is the Big5 charset.  Per
   character of coding->charbuf:
   - ASCII, when the coding system is ASCII compatible: one byte;
   - eight-bit raw-byte character: its byte value;
   - otherwise the character is mapped to (charset, code); a
     charset_big5 code is emitted as its high and low byte, any other
     charset as the single byte code & 0x7F.
   emacs_abort is called if CODE equals the charset's invalid code.
   The function always returns 0.  */
5066 static bool
5067 encode_coding_big5 (struct coding_system *coding)
5069 bool multibytep = coding->dst_multibyte;
5070 int *charbuf = coding->charbuf;
5071 int *charbuf_end = charbuf + coding->charbuf_used;
5072 unsigned char *dst = coding->destination + coding->produced;
5073 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5074 int safe_room = 4;
5075 ptrdiff_t produced_chars = 0;
5076 Lisp_Object attrs, charset_list, val;
5077 bool ascii_compatible;
5078 struct charset *charset_big5;
5079 int c;
5081 CODING_GET_INFO (coding, attrs, charset_list);
5082 val = XCDR (charset_list);
5083 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5084 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5086 while (charbuf < charbuf_end)
5088 ASSURE_DESTINATION (safe_room);
5089 c = *charbuf++;
5090 /* Now encode the character C. */
5091 if (ASCII_CHAR_P (c) && ascii_compatible)
5092 EMIT_ONE_ASCII_BYTE (c);
5093 else if (CHAR_BYTE8_P (c))
5095 c = CHAR_TO_BYTE8 (c);
5096 EMIT_ONE_BYTE (c);
5098 else
5100 unsigned code;
5101 struct charset *charset;
5102 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5103 &code, charset);
5105 if (! charset)
/* No charset of the list can represent C: in safe-encoding mode use
   CODING_INHIBIT_CHARACTER_SUBSTITUTION as an ASCII code, otherwise
   look up the coding system's default character instead.  */
5107 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5109 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5110 charset = CHARSET_FROM_ID (charset_ascii);
5112 else
5114 c = coding->default_char;
5115 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5116 charset_list, &code, charset);
5119 if (code == CHARSET_INVALID_CODE (charset))
5120 emacs_abort ();
5121 if (charset == charset_big5)
5123 int c1, c2;
5125 c1 = code >> 8, c2 = code & 0xFF;
5126 EMIT_TWO_BYTES (c1, c2);
5128 else
5129 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5132 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5133 coding->produced_char += produced_chars;
5134 coding->produced = dst - coding->destination;
5135 return 0;
5139 /*** 10. CCL handlers ***/
/* Reading guide for detect_coding_ccl:
   head_ascii is read from the caller's CODING first; the parameter is
   then rebound to the category entry
   coding_categories[coding_category_ccl], and the validity table and
   attributes are taken from that entry.  The leading ASCII run is
   skipped only when the category's coding system is ASCII compatible.
   Each input value is looked up in `valids': a negative value or a
   zero entry rejects the category and returns 0; an entry greater
   than 1 sets FOUND.  When the input is exhausted (the
   `no_more_source' label, presumably reached from ONE_MORE_BYTE,
   which is defined outside this section -- confirm), FOUND is merged
   into detect_info->found and 1 is returned.  */
5141 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5142 Return true if a text is encoded in a coding system of which
5143 encoder/decoder are written in CCL program. */
5145 static bool
5146 detect_coding_ccl (struct coding_system *coding,
5147 struct coding_detection_info *detect_info)
5149 const unsigned char *src = coding->source, *src_base;
5150 const unsigned char *src_end = coding->source + coding->src_bytes;
5151 bool multibytep = coding->src_multibyte;
5152 ptrdiff_t consumed_chars = 0;
5153 int found = 0;
5154 unsigned char *valids;
5155 ptrdiff_t head_ascii = coding->head_ascii;
5156 Lisp_Object attrs;
5158 detect_info->checked |= CATEGORY_MASK_CCL;
/* From here on CODING refers to the CCL category entry, not to the
   structure passed by the caller.  */
5160 coding = &coding_categories[coding_category_ccl];
5161 valids = CODING_CCL_VALIDS (coding);
5162 attrs = CODING_ID_ATTRS (coding->id);
5163 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5164 src += head_ascii;
5166 while (1)
5168 int c;
5170 src_base = src;
5171 ONE_MORE_BYTE (c);
5172 if (c < 0 || ! valids[c])
5173 break;
5174 if ((valids[c] > 1))
5175 found = CATEGORY_MASK_CCL;
5177 detect_info->rejected |= CATEGORY_MASK_CCL;
5178 return 0;
5180 no_more_source:
5181 detect_info->found |= found;
5182 return 1;
/* Reading guide for decode_coding_ccl:
   Repeatedly copies up to 1024 source characters into source_charbuf
   (decoding with STRING_CHAR_ADVANCE when the source is multibyte,
   else one byte per element) and hands them to ccl_driver, which
   writes into charbuf.  For multibyte sources, source_byteidx[k] is
   the byte offset from SRC of the k-th copied character, with one
   extra entry for the position after the last one, so that
   ccl->consumed characters can be converted back into a byte count.
   The loop stops when the whole source was copied or the CCL status
   is not CCL_STAT_SUSPEND_BY_SRC.  The final CCL status is then
   mapped to a conversion result and the consumed/produced counters
   are stored back into CODING.  */
5185 static void
5186 decode_coding_ccl (struct coding_system *coding)
5188 const unsigned char *src = coding->source + coding->consumed;
5189 const unsigned char *src_end = coding->source + coding->src_bytes;
5190 int *charbuf = coding->charbuf + coding->charbuf_used;
5191 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5192 ptrdiff_t consumed_chars = 0;
5193 bool multibytep = coding->src_multibyte;
5194 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5195 int source_charbuf[1024];
5196 int source_byteidx[1025];
5197 Lisp_Object attrs, charset_list;
5199 CODING_GET_INFO (coding, attrs, charset_list);
5201 while (1)
5203 const unsigned char *p = src;
5204 ptrdiff_t offset;
5205 int i = 0;
5207 if (multibytep)
5209 while (i < 1024 && p < src_end)
5211 source_byteidx[i] = p - src;
5212 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
/* One extra index entry for the position just past the last copied
   character; this is why source_byteidx has 1025 elements.  */
5214 source_byteidx[i] = p - src;
5216 else
5217 while (i < 1024 && p < src_end)
5218 source_charbuf[i++] = *p++;
/* Tell the CCL program that no more input follows only when the
   whole source was copied and this is the last block.  */
5220 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5221 ccl->last_block = true;
5222 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5223 charset_map_loaded = 0;
5224 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5225 charset_list);
5226 if (charset_map_loaded
5227 && (offset = coding_change_source (coding)))
5229 p += offset;
5230 src += offset;
5231 src_end += offset;
5233 charbuf += ccl->produced;
5234 if (multibytep)
5235 src += source_byteidx[ccl->consumed];
5236 else
5237 src += ccl->consumed;
5238 consumed_chars += ccl->consumed;
5239 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5240 break;
5243 switch (ccl->status)
5245 case CCL_STAT_SUSPEND_BY_SRC:
5246 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5247 break;
5248 case CCL_STAT_SUSPEND_BY_DST:
5249 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5250 break;
5251 case CCL_STAT_QUIT:
5252 case CCL_STAT_INVALID_CMD:
5253 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5254 break;
5255 default:
5256 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5257 break;
5259 coding->consumed_char += consumed_chars;
5260 coding->consumed = src - coding->source;
5261 coding->charbuf_used = charbuf - coding->charbuf;
5264 static bool
5265 encode_coding_ccl (struct coding_system *coding)
5267 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5268 bool multibytep = coding->dst_multibyte;
5269 int *charbuf = coding->charbuf;
5270 int *charbuf_end = charbuf + coding->charbuf_used;
5271 unsigned char *dst = coding->destination + coding->produced;
5272 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5273 int destination_charbuf[1024];
5274 ptrdiff_t produced_chars = 0;
5275 int i;
5276 Lisp_Object attrs, charset_list;
5278 CODING_GET_INFO (coding, attrs, charset_list);
5279 if (coding->consumed_char == coding->src_chars
5280 && coding->mode & CODING_MODE_LAST_BLOCK)
5281 ccl->last_block = true;
5285 ptrdiff_t offset;
5287 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5288 charset_map_loaded = 0;
5289 ccl_driver (ccl, charbuf, destination_charbuf,
5290 charbuf_end - charbuf, 1024, charset_list);
5291 if (charset_map_loaded
5292 && (offset = coding_change_destination (coding)))
5293 dst += offset;
5294 if (multibytep)
5296 ASSURE_DESTINATION (ccl->produced * 2);
5297 for (i = 0; i < ccl->produced; i++)
5298 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5300 else
5302 ASSURE_DESTINATION (ccl->produced);
5303 for (i = 0; i < ccl->produced; i++)
5304 *dst++ = destination_charbuf[i] & 0xFF;
5305 produced_chars += ccl->produced;
5307 charbuf += ccl->consumed;
5308 if (ccl->status == CCL_STAT_QUIT
5309 || ccl->status == CCL_STAT_INVALID_CMD)
5310 break;
5312 while (charbuf < charbuf_end);
5314 switch (ccl->status)
5316 case CCL_STAT_SUSPEND_BY_SRC:
5317 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5318 break;
5319 case CCL_STAT_SUSPEND_BY_DST:
5320 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5321 break;
5322 case CCL_STAT_QUIT:
5323 case CCL_STAT_INVALID_CMD:
5324 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5325 break;
5326 default:
5327 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5328 break;
5331 coding->produced_char += produced_chars;
5332 coding->produced = dst - coding->destination;
5333 return 0;
5337 /*** 10, 11. no-conversion handlers ***/
5339 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5341 static void
5342 decode_coding_raw_text (struct coding_system *coding)
5344 bool eol_dos
5345 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5347 coding->chars_at_source = 1;
5348 coding->consumed_char = coding->src_chars;
5349 coding->consumed = coding->src_bytes;
5350 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5352 coding->consumed_char--;
5353 coding->consumed--;
5354 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5356 else
5357 record_conversion_result (coding, CODING_RESULT_SUCCESS);
/* Reading guide for encode_coding_raw_text: four cases, selected by
   whether the destination and the source are multibyte.
   dst multibyte, src multibyte:  ASCII and raw-byte characters are
       emitted with the EMIT macros; any other character is first
       expanded with CHAR_STRING_ADVANCE into `str' and each resulting
       byte is emitted with EMIT_ONE_BYTE.
   dst multibyte, src unibyte:    every element goes through
       EMIT_ONE_BYTE.
   dst unibyte, src multibyte:    ASCII and raw-byte characters are
       stored directly; other characters are written with
       CHAR_STRING_ADVANCE straight into the destination.
   dst unibyte, src unibyte:      elements are copied one per byte.
   In the two unibyte-destination cases produced_chars is computed from
   how far DST advanced.  The function always returns 0.  */
5360 static bool
5361 encode_coding_raw_text (struct coding_system *coding)
5363 bool multibytep = coding->dst_multibyte;
5364 int *charbuf = coding->charbuf;
5365 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5366 unsigned char *dst = coding->destination + coding->produced;
5367 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5368 ptrdiff_t produced_chars = 0;
5369 int c;
5371 if (multibytep)
5373 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5375 if (coding->src_multibyte)
5376 while (charbuf < charbuf_end)
5378 ASSURE_DESTINATION (safe_room);
5379 c = *charbuf++;
5380 if (ASCII_CHAR_P (c))
5381 EMIT_ONE_ASCII_BYTE (c);
5382 else if (CHAR_BYTE8_P (c))
5384 c = CHAR_TO_BYTE8 (c);
5385 EMIT_ONE_BYTE (c);
5387 else
/* p0 walks over the bytes that CHAR_STRING_ADVANCE stored in `str';
   p1 marks the end of those bytes.  */
5389 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5391 CHAR_STRING_ADVANCE (c, p1);
5394 EMIT_ONE_BYTE (*p0);
5395 p0++;
5397 while (p0 < p1);
5400 else
5401 while (charbuf < charbuf_end)
5403 ASSURE_DESTINATION (safe_room);
5404 c = *charbuf++;
5405 EMIT_ONE_BYTE (c);
5408 else
5410 if (coding->src_multibyte)
5412 int safe_room = MAX_MULTIBYTE_LENGTH;
5414 while (charbuf < charbuf_end)
5416 ASSURE_DESTINATION (safe_room);
5417 c = *charbuf++;
5418 if (ASCII_CHAR_P (c))
5419 *dst++ = c;
5420 else if (CHAR_BYTE8_P (c))
5421 *dst++ = CHAR_TO_BYTE8 (c);
5422 else
5423 CHAR_STRING_ADVANCE (c, dst);
5426 else
/* Room for the whole remaining input is requested once, up front; the
   copy loop additionally stops at dst_end.  */
5428 ASSURE_DESTINATION (charbuf_end - charbuf);
5429 while (charbuf < charbuf_end && dst < dst_end)
5430 *dst++ = *charbuf++;
5432 produced_chars = dst - (coding->destination + coding->produced);
5434 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5435 coding->produced_char += produced_chars;
5436 coding->produced = dst - coding->destination;
5437 return 0;
/* Reading guide for detect_coding_charset:
   head_ascii is read from the caller's CODING first; the parameter is
   then rebound to the category entry
   coding_categories[coding_category_charset].  `valids' is indexed by
   the lead byte: nil rejects the byte, an integer names one charset,
   and any other value is walked as a list of charset ids.  For each
   candidate charset the following dim - 1 bytes are checked against
   bounds taken from the charset's code_space array.  For coding
   systems whose name starts with "iso-8859-" or "iso-latin-", a byte
   in 0x80..0x9F is additionally rejected unless
   Vlatin_extra_code_table is a vector with a non-nil entry for it.
   Running out of input in the middle of a multi-byte code (label
   too_short) rejects the category.  `no_more_source' is not the
   target of any goto visible here; presumably ONE_MORE_BYTE (defined
   outside this section) jumps to it at end of input -- confirm.  On
   that path FOUND is merged into detect_info->found and 1 is
   returned.  */
5440 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5441 Return true if a text is encoded in a charset-based coding system. */
5443 static bool
5444 detect_coding_charset (struct coding_system *coding,
5445 struct coding_detection_info *detect_info)
5447 const unsigned char *src = coding->source, *src_base;
5448 const unsigned char *src_end = coding->source + coding->src_bytes;
5449 bool multibytep = coding->src_multibyte;
5450 ptrdiff_t consumed_chars = 0;
5451 Lisp_Object attrs, valids, name;
5452 int found = 0;
5453 ptrdiff_t head_ascii = coding->head_ascii;
5454 bool check_latin_extra = 0;
5456 detect_info->checked |= CATEGORY_MASK_CHARSET;
/* From here on CODING refers to the charset category entry, not to
   the structure passed by the caller.  */
5458 coding = &coding_categories[coding_category_charset];
5459 attrs = CODING_ID_ATTRS (coding->id);
5460 valids = AREF (attrs, coding_attr_charset_valids);
5461 name = CODING_ID_NAME (coding->id);
5462 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5463 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5464 || strncmp (SSDATA (SYMBOL_NAME (name)),
5465 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5466 check_latin_extra = 1;
5468 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5469 src += head_ascii;
5471 while (1)
5473 int c;
5474 Lisp_Object val;
5475 struct charset *charset;
5476 int dim, idx;
5478 src_base = src;
5479 ONE_MORE_BYTE (c);
5480 if (c < 0)
5481 continue;
5482 val = AREF (valids, c);
5483 if (NILP (val))
5484 break;
5485 if (c >= 0x80)
5487 if (c < 0xA0
5488 && check_latin_extra
5489 && (!VECTORP (Vlatin_extra_code_table)
5490 || NILP (AREF (Vlatin_extra_code_table, c))))
5491 break;
5492 found = CATEGORY_MASK_CHARSET;
5494 if (INTEGERP (val))
/* Single candidate charset: every one of the remaining dim - 1 bytes
   must lie within the bounds read from code_space; leaving the loop
   early (idx < dim) rejects.  */
5496 charset = CHARSET_FROM_ID (XFASTINT (val));
5497 dim = CHARSET_DIMENSION (charset);
5498 for (idx = 1; idx < dim; idx++)
5500 if (src == src_end)
5501 goto too_short;
5502 ONE_MORE_BYTE (c);
5503 if (c < charset->code_space[(dim - 1 - idx) * 4]
5504 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5505 break;
5507 if (idx < dim)
5508 break;
5510 else
/* Several candidate charsets: IDX is not reset between candidates, so
   bytes already read and accepted for one candidate are not read
   again for the next.  VAL is set to nil once a candidate matches
   completely; a VAL that is still a cons after the loop means no
   candidate matched.  */
5512 idx = 1;
5513 for (; CONSP (val); val = XCDR (val))
5515 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5516 dim = CHARSET_DIMENSION (charset);
5517 while (idx < dim)
5519 if (src == src_end)
5520 goto too_short;
5521 ONE_MORE_BYTE (c);
5522 if (c < charset->code_space[(dim - 1 - idx) * 4]
5523 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5524 break;
5525 idx++;
5527 if (idx == dim)
5529 val = Qnil;
5530 break;
5533 if (CONSP (val))
5534 break;
5537 too_short:
5538 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5539 return 0;
5541 no_more_source:
5542 detect_info->found |= found;
5543 return 1;
/* Decode source bytes of CODING, starting at coding->consumed, into
   integers stored in coding->charbuf from index coding->charbuf_used.
   The first byte of each character selects, through the coding
   system's `valids' table, a charset or a list of candidate charsets.
   On exit coding->consumed, coding->consumed_char and
   coding->charbuf_used are updated; coding->errors is incremented for
   each byte that could not be decoded.
   NOTE(review): no `goto no_more_source' is visible here; presumably
   ONE_MORE_BYTE jumps there when the source is exhausted, and
   presumably it is what uses MULTIBYTEP and CONSUMED_CHARS -- confirm
   against the macro definition.  */
5546 static void
5547 decode_coding_charset (struct coding_system *coding)
5549 const unsigned char *src = coding->source + coding->consumed;
5550 const unsigned char *src_end = coding->source + coding->src_bytes;
5551 const unsigned char *src_base;
5552 int *charbuf = coding->charbuf + coding->charbuf_used;
5553 /* We may produce one charset annotation in one loop and one more at
5554 the end. */
5555 int *charbuf_end
5556 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5557 ptrdiff_t consumed_chars = 0, consumed_chars_base;
5558 bool multibytep = coding->src_multibyte;
5559 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5560 Lisp_Object valids;
5561 ptrdiff_t char_offset = coding->produced_char;
/* LAST_ID and LAST_OFFSET track the current run of characters from one
   non-ASCII charset, for the ADD_CHARSET_DATA calls below.  */
5562 ptrdiff_t last_offset = char_offset;
5563 int last_id = charset_ascii;
5564 bool eol_dos
5565 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
/* When not negative, a byte already read ahead after a CR.  */
5566 int byte_after_cr = -1;
5568 valids = AREF (attrs, coding_attr_charset_valids);
5570 while (1)
5572 int c;
5573 Lisp_Object val;
5574 struct charset *charset;
5575 int dim;
5576 int len = 1;
5577 unsigned code;
/* Remember where this character starts.  The values recorded here are
   what gets committed at no_more_source and restored at
   invalid_code.  */
5579 src_base = src;
5580 consumed_chars_base = consumed_chars;
/* Stop when the room reserved in CHARBUF is used up.  A pending
   look-ahead byte is given back by moving SRC_BASE one byte back.  */
5582 if (charbuf >= charbuf_end)
5584 if (byte_after_cr >= 0)
5585 src_base--;
5586 break;
/* Use the pending look-ahead byte, if any, before reading more.  */
5589 if (byte_after_cr >= 0)
5591 c = byte_after_cr;
5592 byte_after_cr = -1;
5594 else
5596 ONE_MORE_BYTE (c);
/* With DOS end-of-line, read the byte following a CR ahead of time.  */
5597 if (eol_dos && c == '\r')
5598 ONE_MORE_BYTE (byte_after_cr);
5600 if (c < 0)
5601 goto invalid_code;
5602 code = c;
/* VAL must be either a charset ID (integer) or a list of charset
   IDs.  */
5604 val = AREF (valids, c);
5605 if (! INTEGERP (val) && ! CONSP (val))
5606 goto invalid_code;
5607 if (INTEGERP (val))
5609 charset = CHARSET_FROM_ID (XFASTINT (val));
5610 dim = CHARSET_DIMENSION (charset);
/* Collect the remaining bytes of the character into CODE, earlier
   bytes in the higher-order positions.  */
5611 while (len < dim)
5613 ONE_MORE_BYTE (c);
5614 code = (code << 8) | c;
5615 len++;
/* NOTE(review): presumably CODING_DECODE_CHAR stores the decoded
   character in C, negative on failure, as the tests of C below
   suggest -- confirm against the macro definition.  */
5617 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5618 charset, code, c);
5620 else
5622 /* VAL is a list of charset IDs. It is assured that the
5623 list is sorted by charset dimensions (smaller one
5624 comes first). */
5625 while (CONSP (val))
5627 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5628 dim = CHARSET_DIMENSION (charset);
5629 while (len < dim)
5631 ONE_MORE_BYTE (c);
5632 code = (code << 8) | c;
5633 len++;
5635 CODING_DECODE_CHAR (coding, src, src_base,
5636 src_end, charset, code, c);
/* The first candidate that yields a non-negative C wins.  */
5637 if (c >= 0)
5638 break;
5639 val = XCDR (val);
5642 if (c < 0)
5643 goto invalid_code;
/* On a switch to a different non-ASCII charset, record the run that
   just ended (unless it was ASCII) and start a new one.  */
5644 if (charset->id != charset_ascii
5645 && last_id != charset->id)
5647 if (last_id != charset_ascii)
5648 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5649 last_id = charset->id;
5650 last_offset = char_offset;
5653 *charbuf++ = c;
5654 char_offset++;
5655 continue;
/* Error path: rewind to the start of the character, re-read a single
   byte and store it as -C when C is negative, as is when it is ASCII,
   and as BYTE8_TO_CHAR (C) otherwise.  */
5657 invalid_code:
5658 src = src_base;
5659 consumed_chars = consumed_chars_base;
5660 ONE_MORE_BYTE (c);
5661 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5662 char_offset++;
5663 coding->errors++;
/* Also reached by the `break' out of the loop above.  Record the last
   pending charset run, then commit progress up to the start of the
   character that was being read (SRC_BASE, CONSUMED_CHARS_BASE).  */
5666 no_more_source:
5667 if (last_id != charset_ascii)
5668 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5669 coding->consumed_char += consumed_chars_base;
5670 coding->consumed = src_base - coding->source;
5671 coding->charbuf_used = charbuf - coding->charbuf;
/* Encode the coding->charbuf_used integers in coding->charbuf into
   bytes written to coding->destination from offset coding->produced.
   Updates coding->produced_char and coding->produced, records
   CODING_RESULT_SUCCESS and returns 0.
   NOTE(review): the EMIT_* and ASSURE_DESTINATION macros are not
   visible here; presumably they use DST, DST_END, PRODUCED_CHARS and
   MULTIBYTEP, and ASSURE_DESTINATION may leave the function early when
   there is not enough room -- confirm against their definitions.  */
5674 static bool
5675 encode_coding_charset (struct coding_system *coding)
5677 bool multibytep = coding->dst_multibyte;
5678 int *charbuf = coding->charbuf;
5679 int *charbuf_end = charbuf + coding->charbuf_used;
5680 unsigned char *dst = coding->destination + coding->produced;
5681 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5682 int safe_room = MAX_MULTIBYTE_LENGTH;
5683 ptrdiff_t produced_chars = 0;
5684 Lisp_Object attrs, charset_list;
5685 bool ascii_compatible;
5686 int c;
5688 CODING_GET_INFO (coding, attrs, charset_list);
5689 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5691 while (charbuf < charbuf_end)
5693 struct charset *charset;
5694 unsigned code;
5696 ASSURE_DESTINATION (safe_room);
5697 c = *charbuf++;
/* ASCII characters are emitted directly when the coding system is
   ASCII compatible.  */
5698 if (ascii_compatible && ASCII_CHAR_P (c))
5699 EMIT_ONE_ASCII_BYTE (c);
/* A character satisfying CHAR_BYTE8_P is emitted as the single byte
   given by CHAR_TO_BYTE8.  */
5700 else if (CHAR_BYTE8_P (c))
5702 c = CHAR_TO_BYTE8 (c);
5703 EMIT_ONE_BYTE (c);
5705 else
/* NOTE(review): presumably CODING_CHAR_CHARSET sets CHARSET (null when
   no charset of CHARSET_LIST can encode C) and CODE -- confirm.  */
5707 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5708 &code, charset);
5710 if (charset)
/* Emit CODE in as many bytes as the charset dimension, highest-order
   byte first.  Any dimension other than 1, 2 or 3 takes the four-byte
   branch.  */
5712 if (CHARSET_DIMENSION (charset) == 1)
5713 EMIT_ONE_BYTE (code);
5714 else if (CHARSET_DIMENSION (charset) == 2)
5715 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5716 else if (CHARSET_DIMENSION (charset) == 3)
5717 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5718 else
5719 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5720 (code >> 8) & 0xFF, code & 0xFF);
/* No charset for C: emit one substitute byte, which is
   CODING_INHIBIT_CHARACTER_SUBSTITUTION when CODING_MODE_SAFE_ENCODING
   is set in coding->mode and coding->default_char otherwise.  */
5722 else
5724 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5725 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5726 else
5727 c = coding->default_char;
5728 EMIT_ONE_BYTE (c);
5733 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5734 coding->produced_char += produced_chars;
5735 coding->produced = dst - coding->destination;
5736 return 0;
5740 /*** 10. C library functions ***/
5742 /* Setup coding context CODING from information about CODING_SYSTEM.
5743 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5744 CODING_SYSTEM is invalid, signal an error. */
/* NOTE(review): the code below substitutes Qundecided for a nil
   CODING_SYSTEM, whereas the comment above mentions `no-conversion'
   -- confirm which one is intended.

   The function fills in coding->id, mode, common_flags,
   max_charset_id, safe_charsets, default_char, carryover_bytes and
   raw_destination, then installs the detector, decoder and encoder
   functions (plus type-specific state) according to the type of the
   coding system.  */
5746 void
5747 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5749 Lisp_Object attrs;
5750 Lisp_Object eol_type;
5751 Lisp_Object coding_type;
5752 Lisp_Object val;
5754 if (NILP (coding_system))
5755 coding_system = Qundecided;
5757 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5759 attrs = CODING_ID_ATTRS (coding->id);
/* When inhibit_eol_conversion is set, the end-of-line type is treated
   as Qunix regardless of the coding system.  */
5760 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5762 coding->mode = 0;
/* Initial flags by end-of-line type: a vector requires decoding and
   detection; anything else other than Qunix requires decoding and
   encoding; Qunix requires nothing by itself.  */
5763 if (VECTORP (eol_type))
5764 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5765 | CODING_REQUIRE_DETECTION_MASK);
5766 else if (! EQ (eol_type, Qunix))
5767 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5768 | CODING_REQUIRE_ENCODING_MASK);
5769 else
5770 coding->common_flags = 0;
/* Non-nil post-read, pre-write and for-unibyte attributes add the
   decoding, encoding and for-unibyte flags respectively.  */
5771 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5772 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5773 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5774 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5775 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5776 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
/* coding->safe_charsets points at the data of the safe-charsets
   string attribute; max_charset_id is that string's length minus 1.  */
5778 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5779 coding->max_charset_id = SCHARS (val) - 1;
5780 coding->safe_charsets = SDATA (val);
5781 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5782 coding->carryover_bytes = 0;
5783 coding->raw_destination = 0;
5785 coding_type = CODING_ATTR_TYPE (attrs);
/* Type `undecided': no detector function is installed, the raw-text
   decoder and encoder are used, and detection is required.  */
5786 if (EQ (coding_type, Qundecided))
5788 coding->detector = NULL;
5789 coding->decoder = decode_coding_raw_text;
5790 coding->encoder = encode_coding_raw_text;
5791 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5792 coding->spec.undecided.inhibit_nbd
5793 = (encode_inhibit_flag
5794 (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5795 coding->spec.undecided.inhibit_ied
5796 = (encode_inhibit_flag
5797 (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5798 coding->spec.undecided.prefer_utf_8
5799 = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
/* Type `iso-2022': reset invocation, designation, single-shift and
   composition state, then derive flags from the ISO flags
   attribute.  */
5801 else if (EQ (coding_type, Qiso_2022))
5803 int i;
5804 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5806 /* Invoke graphic register 0 to plane 0. */
5807 CODING_ISO_INVOCATION (coding, 0) = 0;
5808 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5809 CODING_ISO_INVOCATION (coding, 1)
5810 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5811 /* Setup the initial status of designation. */
5812 for (i = 0; i < 4; i++)
5813 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5814 /* Not single shifting initially. */
5815 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5816 /* Beginning of buffer should also be regarded as bol. */
5817 CODING_ISO_BOL (coding) = 1;
5818 coding->detector = detect_coding_iso_2022;
5819 coding->decoder = decode_coding_iso_2022;
5820 coding->encoder = encode_coding_iso_2022;
5821 if (flags & CODING_ISO_FLAG_SAFE)
5822 coding->mode |= CODING_MODE_SAFE_ENCODING;
5823 coding->common_flags
5824 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5825 | CODING_REQUIRE_FLUSHING_MASK);
5826 if (flags & CODING_ISO_FLAG_COMPOSITION)
5827 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5828 if (flags & CODING_ISO_FLAG_DESIGNATION)
5829 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
/* With full support, setup_iso_safe_charsets is called and the
   safe-charsets attribute is read again afterwards.  */
5830 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5832 setup_iso_safe_charsets (attrs);
5833 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5834 coding->max_charset_id = SCHARS (val) - 1;
5835 coding->safe_charsets = SDATA (val);
5837 CODING_ISO_FLAGS (coding) = flags;
5838 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5839 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5840 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5841 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
/* Type `charset'.  */
5843 else if (EQ (coding_type, Qcharset))
5845 coding->detector = detect_coding_charset;
5846 coding->decoder = decode_coding_charset;
5847 coding->encoder = encode_coding_charset;
5848 coding->common_flags
5849 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
/* Type `utf-8': the BOM attribute selects utf_detect_bom when it is a
   cons, utf_with_bom when it is t, and utf_without_bom otherwise.
   Detection is required only for utf_detect_bom.  */
5851 else if (EQ (coding_type, Qutf_8))
5853 val = AREF (attrs, coding_attr_utf_bom);
5854 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5855 : EQ (val, Qt) ? utf_with_bom
5856 : utf_without_bom);
5857 coding->detector = detect_coding_utf_8;
5858 coding->decoder = decode_coding_utf_8;
5859 coding->encoder = encode_coding_utf_8;
5860 coding->common_flags
5861 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5862 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5863 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
/* Type `utf-16': same BOM handling as above; the endian attribute
   selects big endian when it is Qbig and little endian otherwise.  */
5865 else if (EQ (coding_type, Qutf_16))
5867 val = AREF (attrs, coding_attr_utf_bom);
5868 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5869 : EQ (val, Qt) ? utf_with_bom
5870 : utf_without_bom);
5871 val = AREF (attrs, coding_attr_utf_16_endian);
5872 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5873 : utf_16_little_endian);
5874 CODING_UTF_16_SURROGATE (coding) = 0;
5875 coding->detector = detect_coding_utf_16;
5876 coding->decoder = decode_coding_utf_16;
5877 coding->encoder = encode_coding_utf_16;
5878 coding->common_flags
5879 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5880 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5881 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
/* Type `ccl': additionally requires flushing.  */
5883 else if (EQ (coding_type, Qccl))
5885 coding->detector = detect_coding_ccl;
5886 coding->decoder = decode_coding_ccl;
5887 coding->encoder = encode_coding_ccl;
5888 coding->common_flags
5889 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5890 | CODING_REQUIRE_FLUSHING_MASK);
/* Type `emacs-mule'.  */
5892 else if (EQ (coding_type, Qemacs_mule))
5894 coding->detector = detect_coding_emacs_mule;
5895 coding->decoder = decode_coding_emacs_mule;
5896 coding->encoder = encode_coding_emacs_mule;
5897 coding->common_flags
5898 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
/* When the `full' attribute is non-nil and the charset list attribute
   is not Vemacs_mule_charset_list itself, build a fresh safe-charsets
   string: one byte per charset ID up to the largest ID in
   Vemacs_mule_charset_list, 255 everywhere except 0 at the IDs that
   appear in that list.  */
5899 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5900 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5902 Lisp_Object tail, safe_charsets;
5903 int max_charset_id = 0;
5905 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5906 tail = XCDR (tail))
5907 if (max_charset_id < XFASTINT (XCAR (tail)))
5908 max_charset_id = XFASTINT (XCAR (tail));
5909 safe_charsets = make_uninit_string (max_charset_id + 1);
5910 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5911 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5912 tail = XCDR (tail))
5913 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5914 coding->max_charset_id = max_charset_id;
5915 coding->safe_charsets = SDATA (safe_charsets);
5917 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5918 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
/* Type `shift-jis'.  */
5920 else if (EQ (coding_type, Qshift_jis))
5922 coding->detector = detect_coding_sjis;
5923 coding->decoder = decode_coding_sjis;
5924 coding->encoder = encode_coding_sjis;
5925 coding->common_flags
5926 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
/* Type `big5'.  */
5928 else if (EQ (coding_type, Qbig5))
5930 coding->detector = detect_coding_big5;
5931 coding->decoder = decode_coding_big5;
5932 coding->encoder = encode_coding_big5;
5933 coding->common_flags
5934 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
/* Any other type is handled as raw text: decoding is required unless
   the end-of-line type is Qunix, and encoding as well unless the
   end-of-line type is a vector.  */
5936 else /* EQ (coding_type, Qraw_text) */
5938 coding->detector = NULL;
5939 coding->decoder = decode_coding_raw_text;
5940 coding->encoder = encode_coding_raw_text;
5941 if (! EQ (eol_type, Qunix))
5943 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5944 if (! VECTORP (eol_type))
5945 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5950 return;
5953 /* Return a list of charsets supported by CODING. */
5955 Lisp_Object
5956 coding_charset_list (struct coding_system *coding)
5958 Lisp_Object attrs, charset_list;
5960 CODING_GET_INFO (coding, attrs, charset_list);
5961 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5963 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5965 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5966 charset_list = Viso_2022_charset_list;
5968 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5970 charset_list = Vemacs_mule_charset_list;
5972 return charset_list;
5976 /* Return a list of charsets supported by CODING-SYSTEM. */
5978 Lisp_Object
5979 coding_system_charset_list (Lisp_Object coding_system)
5981 ptrdiff_t id;
5982 Lisp_Object attrs, charset_list;
5984 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5985 attrs = CODING_ID_ATTRS (id);
5987 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5989 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5991 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5992 charset_list = Viso_2022_charset_list;
5993 else
5994 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5996 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5998 charset_list = Vemacs_mule_charset_list;
6000 else
6002 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6004 return charset_list;
6008 /* Return raw-text or one of its subsidiaries that has the same
6009 eol_type as CODING-SYSTEM. */
6011 Lisp_Object
6012 raw_text_coding_system (Lisp_Object coding_system)
6014 Lisp_Object spec, attrs;
6015 Lisp_Object eol_type, raw_text_eol_type;
6017 if (NILP (coding_system))
6018 return Qraw_text;
6019 spec = CODING_SYSTEM_SPEC (coding_system);
6020 attrs = AREF (spec, 0);
6022 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6023 return coding_system;
6025 eol_type = AREF (spec, 2);
6026 if (VECTORP (eol_type))
6027 return Qraw_text;
6028 spec = CODING_SYSTEM_SPEC (Qraw_text);
6029 raw_text_eol_type = AREF (spec, 2);
6030 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6031 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6032 : AREF (raw_text_eol_type, 2));
6036 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6037 the subsidiary that has the same eol-spec as PARENT (if it is not
6038 nil and specifies end-of-line format) or the system's setting
6039 (system_eol_type). */
6041 Lisp_Object
6042 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6044 Lisp_Object spec, eol_type;
6046 if (NILP (coding_system))
6047 coding_system = Qraw_text;
6048 spec = CODING_SYSTEM_SPEC (coding_system);
6049 eol_type = AREF (spec, 2);
6050 if (VECTORP (eol_type))
6052 Lisp_Object parent_eol_type;
6054 if (! NILP (parent))
6056 Lisp_Object parent_spec;
6058 parent_spec = CODING_SYSTEM_SPEC (parent);
6059 parent_eol_type = AREF (parent_spec, 2);
6060 if (VECTORP (parent_eol_type))
6061 parent_eol_type = system_eol_type;
6063 else
6064 parent_eol_type = system_eol_type;
6065 if (EQ (parent_eol_type, Qunix))
6066 coding_system = AREF (eol_type, 0);
6067 else if (EQ (parent_eol_type, Qdos))
6068 coding_system = AREF (eol_type, 1);
6069 else if (EQ (parent_eol_type, Qmac))
6070 coding_system = AREF (eol_type, 2);
6072 return coding_system;
6076 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6077 decided for writing to a process. If not, complement them, and
6078 return a new coding system. */
6080 Lisp_Object
6081 complement_process_encoding_system (Lisp_Object coding_system)
6083 Lisp_Object coding_base = Qnil, eol_base = Qnil;
6084 Lisp_Object spec, attrs;
6085 int i;
6087 for (i = 0; i < 3; i++)
6089 if (i == 1)
6090 coding_system = CDR_SAFE (Vdefault_process_coding_system);
6091 else if (i == 2)
6092 coding_system = preferred_coding_system ();
6093 spec = CODING_SYSTEM_SPEC (coding_system);
6094 if (NILP (spec))
6095 continue;
6096 attrs = AREF (spec, 0);
6097 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6098 coding_base = CODING_ATTR_BASE_NAME (attrs);
6099 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6100 eol_base = coding_system;
6101 if (! NILP (coding_base) && ! NILP (eol_base))
6102 break;
6105 if (i > 0)
6106 /* The original CODING_SYSTEM didn't specify text-conversion or
6107 eol-conversion. Be sure that we return a fully complemented
6108 coding system. */
6109 coding_system = coding_inherit_eol_type (coding_base, eol_base);
6110 return coding_system;
6114 /* Emacs has a mechanism to automatically detect a coding system if it
6115 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6116 it's impossible to distinguish some coding systems accurately
6117 because they use the same range of codes. So, at first, coding
6118 systems are classified into the following categories:
6120 o coding-category-emacs-mule
6122 The category for a coding system which has the same code range
6123 as Emacs' internal format. Assigned the coding-system (Lisp
6124 symbol) `emacs-mule' by default.
6126 o coding-category-sjis
6128 The category for a coding system which has the same code range
6129 as SJIS. Assigned the coding-system (Lisp
6130 symbol) `japanese-shift-jis' by default.
6132 o coding-category-iso-7
6134 The category for a coding system which has the same code range
6135 as ISO2022 of 7-bit environment. This doesn't use any locking
6136 shift and single shift functions. This can encode/decode all
6137 charsets. Assigned the coding-system (Lisp symbol)
6138 `iso-2022-7bit' by default.
6140 o coding-category-iso-7-tight
6142 Same as coding-category-iso-7 except that this can
6143 encode/decode only the specified charsets.
6145 o coding-category-iso-8-1
6147 The category for a coding system which has the same code range
6148 as ISO2022 of 8-bit environment and graphic plane 1 used only
6149 for DIMENSION1 charset. This doesn't use any locking shift
6150 and single shift functions. Assigned the coding-system (Lisp
6151 symbol) `iso-latin-1' by default.
6153 o coding-category-iso-8-2
6155 The category for a coding system which has the same code range
6156 as ISO2022 of 8-bit environment and graphic plane 1 used only
6157 for DIMENSION2 charset. This doesn't use any locking shift
6158 and single shift functions. Assigned the coding-system (Lisp
6159 symbol) `japanese-iso-8bit' by default.
6161 o coding-category-iso-7-else
6163 The category for a coding system which has the same code range
6164 as ISO2022 of 7-bit environment but uses locking shift or
6165 single shift functions. Assigned the coding-system (Lisp
6166 symbol) `iso-2022-7bit-lock' by default.
6168 o coding-category-iso-8-else
6170 The category for a coding system which has the same code range
6171 as ISO2022 of 8-bit environment but uses locking shift or
6172 single shift functions. Assigned the coding-system (Lisp
6173 symbol) `iso-2022-8bit-ss2' by default.
6175 o coding-category-big5
6177 The category for a coding system which has the same code range
6178 as BIG5. Assigned the coding-system (Lisp symbol)
6179 `cn-big5' by default.
6181 o coding-category-utf-8
6183 The category for a coding system which has the same code range
6184 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6185 symbol) `utf-8' by default.
6187 o coding-category-utf-16-be
6189 The category for a coding system in which a text has a
6190 Unicode signature (cf. Unicode Standard) in the order of BIG
6191 endian at the head. Assigned the coding-system (Lisp symbol)
6192 `utf-16-be' by default.
6194 o coding-category-utf-16-le
6196 The category for a coding system in which a text has a
6197 Unicode signature (cf. Unicode Standard) in the order of
6198 LITTLE endian at the head. Assigned the coding-system (Lisp
6199 symbol) `utf-16-le' by default.
6201 o coding-category-ccl
6203 The category for a coding system of which encoder/decoder is
6204 written in CCL programs. The default value is nil, i.e., no
6205 coding system is assigned.
6207 o coding-category-binary
6209 The category for a coding system not categorized in any of the
6210 above. Assigned the coding-system (Lisp symbol)
6211 `no-conversion' by default.
6213 Each of them is a Lisp symbol whose value is an actual
6214 `coding-system' (this is also a Lisp symbol) assigned by a user.
6215 What Emacs actually does is detect a category of coding system.
6216 Then, it uses the `coding-system' assigned to that category. If Emacs
6217 can't narrow the result down to one category, it selects the category
6218 of the highest priority. Priorities of categories are also specified
6219 by a user in a Lisp variable `coding-category-list'.
6223 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6224 int eol_seen);
6227 /* Return the number of ASCII characters at the head of the source.
6228 By side effects, set coding->head_ascii and update
6229 coding->eol_seen. The value of coding->eol_seen is "logical or" of
6230 EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6231 reliable only when all the source bytes are ASCII. */
6233 static ptrdiff_t
6234 check_ascii (struct coding_system *coding)
6236 const unsigned char *src, *end;
6237 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6238 int eol_seen = coding->eol_seen;
6240 coding_set_source (coding);
6241 src = coding->source;
6242 end = src + coding->src_bytes;
6244 if (inhibit_eol_conversion
6245 || SYMBOLP (eol_type))
6247 /* We don't have to check EOL format. */
6248 while (src < end && !( *src & 0x80))
6250 if (*src++ == '\n')
6251 eol_seen |= EOL_SEEN_LF;
6254 else
6256 end--; /* We look ahead one byte for "CR LF". */
6257 while (src < end)
6259 int c = *src;
6261 if (c & 0x80)
6262 break;
6263 src++;
6264 if (c == '\r')
6266 if (*src == '\n')
6268 eol_seen |= EOL_SEEN_CRLF;
6269 src++;
6271 else
6272 eol_seen |= EOL_SEEN_CR;
6274 else if (c == '\n')
6275 eol_seen |= EOL_SEEN_LF;
6277 if (src == end)
6279 int c = *src;
6281 /* All bytes but the last one C are ASCII. */
6282 if (! (c & 0x80))
6284 if (c == '\r')
6285 eol_seen |= EOL_SEEN_CR;
6286 else if (c == '\n')
6287 eol_seen |= EOL_SEEN_LF;
6288 src++;
6292 coding->head_ascii = src - coding->source;
6293 coding->eol_seen = eol_seen;
6294 return (coding->head_ascii);
6298 /* Return the number of characters at the source if all the bytes are
6299 valid UTF-8 (of Unicode range). Otherwise, return -1. By side
6300 effects, update coding->eol_seen. The value of coding->eol_seen is
6301 "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6302 the value is reliable only when all the source bytes are valid
6303 UTF-8. */
6305 static ptrdiff_t
6306 check_utf_8 (struct coding_system *coding)
6308 const unsigned char *src, *end;
6309 int eol_seen;
6310 ptrdiff_t nchars = coding->head_ascii;
6312 if (coding->head_ascii < 0)
6313 check_ascii (coding);
6314 else
6315 coding_set_source (coding);
6316 src = coding->source + coding->head_ascii;
6317 /* We look ahead one byte for CR LF. */
6318 end = coding->source + coding->src_bytes - 1;
6319 eol_seen = coding->eol_seen;
6320 while (src < end)
6322 int c = *src;
6324 if (UTF_8_1_OCTET_P (*src))
6326 src++;
6327 if (c < 0x20)
6329 if (c == '\r')
6331 if (*src == '\n')
6333 eol_seen |= EOL_SEEN_CRLF;
6334 src++;
6335 nchars++;
6337 else
6338 eol_seen |= EOL_SEEN_CR;
6340 else if (c == '\n')
6341 eol_seen |= EOL_SEEN_LF;
6344 else if (UTF_8_2_OCTET_LEADING_P (c))
6346 if (c < 0xC2 /* overlong sequence */
6347 || src + 1 >= end
6348 || ! UTF_8_EXTRA_OCTET_P (src[1]))
6349 return -1;
6350 src += 2;
6352 else if (UTF_8_3_OCTET_LEADING_P (c))
6354 if (src + 2 >= end
6355 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6356 && UTF_8_EXTRA_OCTET_P (src[2])))
6357 return -1;
6358 c = (((c & 0xF) << 12)
6359 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6360 if (c < 0x800 /* overlong sequence */
6361 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6362 return -1;
6363 src += 3;
6365 else if (UTF_8_4_OCTET_LEADING_P (c))
6367 if (src + 3 >= end
6368 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6369 && UTF_8_EXTRA_OCTET_P (src[2])
6370 && UTF_8_EXTRA_OCTET_P (src[3])))
6371 return -1;
6372 c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6373 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6374 if (c < 0x10000 /* overlong sequence */
6375 || c >= 0x110000) /* non-Unicode character */
6376 return -1;
6377 src += 4;
6379 else
6380 return -1;
6381 nchars++;
6384 if (src == end)
6386 if (! UTF_8_1_OCTET_P (*src))
6387 return -1;
6388 nchars++;
6389 if (*src == '\r')
6390 eol_seen |= EOL_SEEN_CR;
6391 else if (*src == '\n')
6392 eol_seen |= EOL_SEEN_LF;
6394 coding->eol_seen = eol_seen;
6395 return nchars;
6399 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6400 SOURCE is encoded. If CATEGORY is one of
6401 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6402 two-byte, else they are encoded by one-byte.
6404 Return one of EOL_SEEN_XXX. */
6406 #define MAX_EOL_CHECK_COUNT 3
6408 static int
6409 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6410 enum coding_category category)
6412 const unsigned char *src = source, *src_end = src + src_bytes;
6413 unsigned char c;
6414 int total = 0;
6415 int eol_seen = EOL_SEEN_NONE;
6417 if ((1 << category) & CATEGORY_MASK_UTF_16)
6419 bool msb = category == (coding_category_utf_16_le
6420 | coding_category_utf_16_le_nosig);
6421 bool lsb = !msb;
6423 while (src + 1 < src_end)
6425 c = src[lsb];
6426 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6428 int this_eol;
6430 if (c == '\n')
6431 this_eol = EOL_SEEN_LF;
6432 else if (src + 3 >= src_end
6433 || src[msb + 2] != 0
6434 || src[lsb + 2] != '\n')
6435 this_eol = EOL_SEEN_CR;
6436 else
6438 this_eol = EOL_SEEN_CRLF;
6439 src += 2;
6442 if (eol_seen == EOL_SEEN_NONE)
6443 /* This is the first end-of-line. */
6444 eol_seen = this_eol;
6445 else if (eol_seen != this_eol)
6447 /* The found type is different from what found before.
6448 Allow for stray ^M characters in DOS EOL files. */
6449 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6450 || (eol_seen == EOL_SEEN_CRLF
6451 && this_eol == EOL_SEEN_CR))
6452 eol_seen = EOL_SEEN_CRLF;
6453 else
6455 eol_seen = EOL_SEEN_LF;
6456 break;
6459 if (++total == MAX_EOL_CHECK_COUNT)
6460 break;
6462 src += 2;
6465 else
6466 while (src < src_end)
6468 c = *src++;
6469 if (c == '\n' || c == '\r')
6471 int this_eol;
6473 if (c == '\n')
6474 this_eol = EOL_SEEN_LF;
6475 else if (src >= src_end || *src != '\n')
6476 this_eol = EOL_SEEN_CR;
6477 else
6478 this_eol = EOL_SEEN_CRLF, src++;
6480 if (eol_seen == EOL_SEEN_NONE)
6481 /* This is the first end-of-line. */
6482 eol_seen = this_eol;
6483 else if (eol_seen != this_eol)
6485 /* The found type is different from what found before.
6486 Allow for stray ^M characters in DOS EOL files. */
6487 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6488 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6489 eol_seen = EOL_SEEN_CRLF;
6490 else
6492 eol_seen = EOL_SEEN_LF;
6493 break;
6496 if (++total == MAX_EOL_CHECK_COUNT)
6497 break;
6500 return eol_seen;
6504 static Lisp_Object
6505 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6507 Lisp_Object eol_type;
6509 eol_type = CODING_ID_EOL_TYPE (coding->id);
6510 if (! VECTORP (eol_type))
6511 /* Already adjusted. */
6512 return eol_type;
6513 if (eol_seen & EOL_SEEN_LF)
6515 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6516 eol_type = Qunix;
6518 else if (eol_seen & EOL_SEEN_CRLF)
6520 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6521 eol_type = Qdos;
6523 else if (eol_seen & EOL_SEEN_CR)
6525 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6526 eol_type = Qmac;
6528 return eol_type;
6531 /* Detect how a text specified in CODING is encoded. If a coding
6532 system is detected, update fields of CODING by the detected coding
6533 system. */
6535 static void
6536 detect_coding (struct coding_system *coding)
6538 const unsigned char *src, *src_end;
6539 unsigned int saved_mode = coding->mode;
6540 Lisp_Object found = Qnil;
6541 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6543 coding->consumed = coding->consumed_char = 0;
6544 coding->produced = coding->produced_char = 0;
6545 coding_set_source (coding);
6547 src_end = coding->source + coding->src_bytes;
6549 coding->eol_seen = EOL_SEEN_NONE;
6550 /* If we have not yet decided the text encoding type, detect it
6551 now. */
6552 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6554 int c, i;
6555 struct coding_detection_info detect_info;
6556 bool null_byte_found = 0, eight_bit_found = 0;
6557 bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6558 inhibit_null_byte_detection);
6559 bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6560 inhibit_iso_escape_detection);
6561 bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6563 coding->head_ascii = 0;
6564 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6565 for (src = coding->source; src < src_end; src++)
6567 c = *src;
6568 if (c & 0x80)
6570 eight_bit_found = 1;
6571 if (null_byte_found)
6572 break;
6574 else if (c < 0x20)
6576 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6577 && ! inhibit_ied
6578 && ! detect_info.checked)
6580 if (detect_coding_iso_2022 (coding, &detect_info))
6582 /* We have scanned the whole data. */
6583 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6585 /* We didn't find an 8-bit code. We may
6586 have found a null-byte, but it's very
6587 rare that a binary file conforms to
6588 ISO-2022. */
6589 src = src_end;
6590 coding->head_ascii = src - coding->source;
6592 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6593 break;
6596 else if (! c && !inhibit_nbd)
6598 null_byte_found = 1;
6599 if (eight_bit_found)
6600 break;
6602 else if (! disable_ascii_optimization
6603 && ! inhibit_eol_conversion)
6605 if (c == '\r')
6607 if (src < src_end && src[1] == '\n')
6609 coding->eol_seen |= EOL_SEEN_CRLF;
6610 src++;
6611 if (! eight_bit_found)
6612 coding->head_ascii++;
6614 else
6615 coding->eol_seen |= EOL_SEEN_CR;
6617 else if (c == '\n')
6619 coding->eol_seen |= EOL_SEEN_LF;
6623 if (! eight_bit_found)
6624 coding->head_ascii++;
6626 else if (! eight_bit_found)
6627 coding->head_ascii++;
6630 if (null_byte_found || eight_bit_found
6631 || coding->head_ascii < coding->src_bytes
6632 || detect_info.found)
6634 enum coding_category category;
6635 struct coding_system *this;
6637 if (coding->head_ascii == coding->src_bytes)
6638 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6639 for (i = 0; i < coding_category_raw_text; i++)
6641 category = coding_priorities[i];
6642 this = coding_categories + category;
6643 if (detect_info.found & (1 << category))
6644 break;
6646 else
6648 if (null_byte_found)
6650 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6651 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6653 else if (prefer_utf_8
6654 && detect_coding_utf_8 (coding, &detect_info))
6656 detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6657 detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6659 for (i = 0; i < coding_category_raw_text; i++)
6661 category = coding_priorities[i];
6662 this = coding_categories + category;
6663 /* Some of this->detector (e.g. detect_coding_sjis)
6664 require this information. */
6665 coding->id = this->id;
6666 if (this->id < 0)
6668 /* No coding system of this category is defined. */
6669 detect_info.rejected |= (1 << category);
6671 else if (category >= coding_category_raw_text)
6672 continue;
6673 else if (detect_info.checked & (1 << category))
6675 if (detect_info.found & (1 << category))
6676 break;
6678 else if ((*(this->detector)) (coding, &detect_info)
6679 && detect_info.found & (1 << category))
6680 break;
6684 if (i < coding_category_raw_text)
6686 if (category == coding_category_utf_8_auto)
6688 Lisp_Object coding_systems;
6690 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6691 coding_attr_utf_bom);
6692 if (CONSP (coding_systems))
6694 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6695 found = XCAR (coding_systems);
6696 else
6697 found = XCDR (coding_systems);
6699 else
6700 found = CODING_ID_NAME (this->id);
6702 else if (category == coding_category_utf_16_auto)
6704 Lisp_Object coding_systems;
6706 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6707 coding_attr_utf_bom);
6708 if (CONSP (coding_systems))
6710 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6711 found = XCAR (coding_systems);
6712 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6713 found = XCDR (coding_systems);
6715 else
6716 found = CODING_ID_NAME (this->id);
6718 else
6719 found = CODING_ID_NAME (this->id);
6721 else if (null_byte_found)
6722 found = Qno_conversion;
6723 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6724 == CATEGORY_MASK_ANY)
6725 found = Qraw_text;
6726 else if (detect_info.rejected)
6727 for (i = 0; i < coding_category_raw_text; i++)
6728 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6730 this = coding_categories + coding_priorities[i];
6731 found = CODING_ID_NAME (this->id);
6732 break;
6736 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6737 == coding_category_utf_8_auto)
6739 Lisp_Object coding_systems;
6740 struct coding_detection_info detect_info;
6742 coding_systems
6743 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6744 detect_info.found = detect_info.rejected = 0;
6745 if (check_ascii (coding) == coding->src_bytes)
6747 if (CONSP (coding_systems))
6748 found = XCDR (coding_systems);
6750 else
6752 if (CONSP (coding_systems)
6753 && detect_coding_utf_8 (coding, &detect_info))
6755 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6756 found = XCAR (coding_systems);
6757 else
6758 found = XCDR (coding_systems);
6762 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6763 == coding_category_utf_16_auto)
6765 Lisp_Object coding_systems;
6766 struct coding_detection_info detect_info;
6768 coding_systems
6769 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6770 detect_info.found = detect_info.rejected = 0;
6771 coding->head_ascii = 0;
6772 if (CONSP (coding_systems)
6773 && detect_coding_utf_16 (coding, &detect_info))
6775 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6776 found = XCAR (coding_systems);
6777 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6778 found = XCDR (coding_systems);
6782 if (! NILP (found))
6784 int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6785 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6786 : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6787 : EOL_SEEN_LF);
6789 setup_coding_system (found, coding);
6790 if (specified_eol != EOL_SEEN_NONE)
6791 adjust_coding_eol_type (coding, specified_eol);
6794 coding->mode = saved_mode;
6798 static void
6799 decode_eol (struct coding_system *coding)
6801 Lisp_Object eol_type;
6802 unsigned char *p, *pbeg, *pend;
6804 eol_type = CODING_ID_EOL_TYPE (coding->id);
6805 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6806 return;
6808 if (NILP (coding->dst_object))
6809 pbeg = coding->destination;
6810 else
6811 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6812 pend = pbeg + coding->produced;
6814 if (VECTORP (eol_type))
6816 int eol_seen = EOL_SEEN_NONE;
6818 for (p = pbeg; p < pend; p++)
6820 if (*p == '\n')
6821 eol_seen |= EOL_SEEN_LF;
6822 else if (*p == '\r')
6824 if (p + 1 < pend && *(p + 1) == '\n')
6826 eol_seen |= EOL_SEEN_CRLF;
6827 p++;
6829 else
6830 eol_seen |= EOL_SEEN_CR;
6833 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6834 if ((eol_seen & EOL_SEEN_CRLF) != 0
6835 && (eol_seen & EOL_SEEN_CR) != 0
6836 && (eol_seen & EOL_SEEN_LF) == 0)
6837 eol_seen = EOL_SEEN_CRLF;
6838 else if (eol_seen != EOL_SEEN_NONE
6839 && eol_seen != EOL_SEEN_LF
6840 && eol_seen != EOL_SEEN_CRLF
6841 && eol_seen != EOL_SEEN_CR)
6842 eol_seen = EOL_SEEN_LF;
6843 if (eol_seen != EOL_SEEN_NONE)
6844 eol_type = adjust_coding_eol_type (coding, eol_seen);
6847 if (EQ (eol_type, Qmac))
6849 for (p = pbeg; p < pend; p++)
6850 if (*p == '\r')
6851 *p = '\n';
6853 else if (EQ (eol_type, Qdos))
6855 ptrdiff_t n = 0;
6857 if (NILP (coding->dst_object))
6859 /* Start deleting '\r' from the tail to minimize the memory
6860 movement. */
6861 for (p = pend - 2; p >= pbeg; p--)
6862 if (*p == '\r')
6864 memmove (p, p + 1, pend-- - p - 1);
6865 n++;
6868 else
6870 ptrdiff_t pos_byte = coding->dst_pos_byte;
6871 ptrdiff_t pos = coding->dst_pos;
6872 ptrdiff_t pos_end = pos + coding->produced_char - 1;
6874 while (pos < pos_end)
6876 p = BYTE_POS_ADDR (pos_byte);
6877 if (*p == '\r' && p[1] == '\n')
6879 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6880 n++;
6881 pos_end--;
6883 pos++;
6884 if (coding->dst_multibyte)
6885 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6886 else
6887 pos_byte++;
6890 coding->produced -= n;
6891 coding->produced_char -= n;
6896 /* Return a translation table (or list of them) from coding system
6897 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6898 not ENCODEP). */
6900 static Lisp_Object
6901 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6903 Lisp_Object standard, translation_table;
6904 Lisp_Object val;
6906 if (NILP (Venable_character_translation))
6908 if (max_lookup)
6909 *max_lookup = 0;
6910 return Qnil;
6912 if (encodep)
6913 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6914 standard = Vstandard_translation_table_for_encode;
6915 else
6916 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6917 standard = Vstandard_translation_table_for_decode;
6918 if (NILP (translation_table))
6919 translation_table = standard;
6920 else
6922 if (SYMBOLP (translation_table))
6923 translation_table = Fget (translation_table, Qtranslation_table);
6924 else if (CONSP (translation_table))
6926 translation_table = Fcopy_sequence (translation_table);
6927 for (val = translation_table; CONSP (val); val = XCDR (val))
6928 if (SYMBOLP (XCAR (val)))
6929 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6931 if (CHAR_TABLE_P (standard))
6933 if (CONSP (translation_table))
6934 translation_table = nconc2 (translation_table, list1 (standard));
6935 else
6936 translation_table = list2 (translation_table, standard);
6940 if (max_lookup)
6942 *max_lookup = 1;
6943 if (CHAR_TABLE_P (translation_table)
6944 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6946 val = XCHAR_TABLE (translation_table)->extras[1];
6947 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6948 *max_lookup = XFASTINT (val);
6950 else if (CONSP (translation_table))
6952 Lisp_Object tail;
6954 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6955 if (CHAR_TABLE_P (XCAR (tail))
6956 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6958 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6959 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6960 *max_lookup = XFASTINT (tailval);
6964 return translation_table;
/* Look up character C in TABLE, which is a char-table or a list whose
   char-table elements are consulted in order (any other TABLE leaves C
   unchanged and TRANS nil).  C and TRANS must be lvalues.

   Whenever a table maps C to a character, C is replaced with that
   character and the lookup goes on.  On exit TRANS is Qnil, or, if a
   table entry was something other than nil or a character, that entry;
   for a list of tables the search stops at the first such entry.

   TABLE is evaluated exactly once, and all arguments are
   parenthesized in the expansion.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    Lisp_Object lookup_tables_ = (table);                               \
                                                                        \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (lookup_tables_))                                  \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (lookup_tables_, (c));                 \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else                                                                \
      for (; CONSP (lookup_tables_);                                    \
           lookup_tables_ = XCDR (lookup_tables_))                      \
        if (CHAR_TABLE_P (XCAR (lookup_tables_)))                       \
          {                                                             \
            (trans) = CHAR_TABLE_REF (XCAR (lookup_tables_), (c));      \
            if (CHARACTERP (trans))                                     \
              (c) = XFASTINT (trans), (trans) = Qnil;                   \
            else if (! NILP (trans))                                    \
              break;                                                    \
          }                                                             \
  } while (0)
6993 /* Return a translation of character(s) at BUF according to TRANS.
6994 TRANS is TO-CHAR or ((FROM . TO) ...) where
6995 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6996 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6997 translation is found, and Qnil if not found..
6998 If BUF is too short to lookup characters in FROM, return Qt. */
7000 static Lisp_Object
7001 get_translation (Lisp_Object trans, int *buf, int *buf_end)
7004 if (INTEGERP (trans))
7005 return trans;
7006 for (; CONSP (trans); trans = XCDR (trans))
7008 Lisp_Object val = XCAR (trans);
7009 Lisp_Object from = XCAR (val);
7010 ptrdiff_t len = ASIZE (from);
7011 ptrdiff_t i;
7013 for (i = 0; i < len; i++)
7015 if (buf + i == buf_end)
7016 return Qt;
7017 if (XINT (AREF (from, i)) != buf[i])
7018 break;
7020 if (i == len)
7021 return val;
7023 return Qnil;
7027 static int
7028 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7029 bool last_block)
7031 unsigned char *dst = coding->destination + coding->produced;
7032 unsigned char *dst_end = coding->destination + coding->dst_bytes;
7033 ptrdiff_t produced;
7034 ptrdiff_t produced_chars = 0;
7035 int carryover = 0;
7037 if (! coding->chars_at_source)
7039 /* Source characters are in coding->charbuf. */
7040 int *buf = coding->charbuf;
7041 int *buf_end = buf + coding->charbuf_used;
7043 if (EQ (coding->src_object, coding->dst_object))
7045 coding_set_source (coding);
7046 dst_end = ((unsigned char *) coding->source) + coding->consumed;
7049 while (buf < buf_end)
7051 int c = *buf;
7052 ptrdiff_t i;
7054 if (c >= 0)
7056 ptrdiff_t from_nchars = 1, to_nchars = 1;
7057 Lisp_Object trans = Qnil;
7059 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7060 if (! NILP (trans))
7062 trans = get_translation (trans, buf, buf_end);
7063 if (INTEGERP (trans))
7064 c = XINT (trans);
7065 else if (CONSP (trans))
7067 from_nchars = ASIZE (XCAR (trans));
7068 trans = XCDR (trans);
7069 if (INTEGERP (trans))
7070 c = XINT (trans);
7071 else
7073 to_nchars = ASIZE (trans);
7074 c = XINT (AREF (trans, 0));
7077 else if (EQ (trans, Qt) && ! last_block)
7078 break;
7081 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7083 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7084 / MAX_MULTIBYTE_LENGTH)
7085 < to_nchars)
7086 memory_full (SIZE_MAX);
7087 dst = alloc_destination (coding,
7088 buf_end - buf
7089 + MAX_MULTIBYTE_LENGTH * to_nchars,
7090 dst);
7091 if (EQ (coding->src_object, coding->dst_object))
7093 coding_set_source (coding);
7094 dst_end = (((unsigned char *) coding->source)
7095 + coding->consumed);
7097 else
7098 dst_end = coding->destination + coding->dst_bytes;
7101 for (i = 0; i < to_nchars; i++)
7103 if (i > 0)
7104 c = XINT (AREF (trans, i));
7105 if (coding->dst_multibyte
7106 || ! CHAR_BYTE8_P (c))
7107 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7108 else
7109 *dst++ = CHAR_TO_BYTE8 (c);
7111 produced_chars += to_nchars;
7112 buf += from_nchars;
7114 else
7115 /* This is an annotation datum. (-C) is the length. */
7116 buf += -c;
7118 carryover = buf_end - buf;
7120 else
7122 /* Source characters are at coding->source. */
7123 const unsigned char *src = coding->source;
7124 const unsigned char *src_end = src + coding->consumed;
7126 if (EQ (coding->dst_object, coding->src_object))
7127 dst_end = (unsigned char *) src;
7128 if (coding->src_multibyte != coding->dst_multibyte)
7130 if (coding->src_multibyte)
7132 bool multibytep = 1;
7133 ptrdiff_t consumed_chars = 0;
7135 while (1)
7137 const unsigned char *src_base = src;
7138 int c;
7140 ONE_MORE_BYTE (c);
7141 if (dst == dst_end)
7143 if (EQ (coding->src_object, coding->dst_object))
7144 dst_end = (unsigned char *) src;
7145 if (dst == dst_end)
7147 ptrdiff_t offset = src - coding->source;
7149 dst = alloc_destination (coding, src_end - src + 1,
7150 dst);
7151 dst_end = coding->destination + coding->dst_bytes;
7152 coding_set_source (coding);
7153 src = coding->source + offset;
7154 src_end = coding->source + coding->consumed;
7155 if (EQ (coding->src_object, coding->dst_object))
7156 dst_end = (unsigned char *) src;
7159 *dst++ = c;
7160 produced_chars++;
7162 no_more_source:
7165 else
7166 while (src < src_end)
7168 bool multibytep = 1;
7169 int c = *src++;
7171 if (dst >= dst_end - 1)
7173 if (EQ (coding->src_object, coding->dst_object))
7174 dst_end = (unsigned char *) src;
7175 if (dst >= dst_end - 1)
7177 ptrdiff_t offset = src - coding->source;
7178 ptrdiff_t more_bytes;
7180 if (EQ (coding->src_object, coding->dst_object))
7181 more_bytes = ((src_end - src) / 2) + 2;
7182 else
7183 more_bytes = src_end - src + 2;
7184 dst = alloc_destination (coding, more_bytes, dst);
7185 dst_end = coding->destination + coding->dst_bytes;
7186 coding_set_source (coding);
7187 src = coding->source + offset;
7188 src_end = coding->source + coding->consumed;
7189 if (EQ (coding->src_object, coding->dst_object))
7190 dst_end = (unsigned char *) src;
7193 EMIT_ONE_BYTE (c);
7196 else
7198 if (!EQ (coding->src_object, coding->dst_object))
7200 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7202 if (require > 0)
7204 ptrdiff_t offset = src - coding->source;
7206 dst = alloc_destination (coding, require, dst);
7207 coding_set_source (coding);
7208 src = coding->source + offset;
7209 src_end = coding->source + coding->consumed;
7212 produced_chars = coding->consumed_char;
7213 while (src < src_end)
7214 *dst++ = *src++;
7218 produced = dst - (coding->destination + coding->produced);
7219 if (BUFFERP (coding->dst_object) && produced_chars > 0)
7220 insert_from_gap (produced_chars, produced, 0);
7221 coding->produced += produced;
7222 coding->produced_char += produced_chars;
7223 return carryover;
7226 /* Compose text in CODING->object according to the annotation data at
7227 CHARBUF. CHARBUF is an array:
7228 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7231 static void
7232 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7234 int len;
7235 ptrdiff_t to;
7236 enum composition_method method;
7237 Lisp_Object components;
7239 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7240 to = pos + charbuf[2];
7241 method = (enum composition_method) (charbuf[4]);
7243 if (method == COMPOSITION_RELATIVE)
7244 components = Qnil;
7245 else
7247 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7248 int i, j;
7250 if (method == COMPOSITION_WITH_RULE)
7251 len = charbuf[2] * 3 - 2;
7252 charbuf += MAX_ANNOTATION_LENGTH;
7253 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7254 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7256 if (charbuf[i] >= 0)
7257 args[j] = make_number (charbuf[i]);
7258 else
7260 i++;
7261 args[j] = make_number (charbuf[i] % 0x100);
7264 components = (i == j ? Fstring (j, args) : Fvector (j, args));
7266 compose_text (pos, to, components, Qnil, coding->dst_object);
7270 /* Put `charset' property on text in CODING->object according to
7271 the annotation data at CHARBUF. CHARBUF is an array:
7272 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7275 static void
7276 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7278 ptrdiff_t from = pos - charbuf[2];
7279 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7281 Fput_text_property (make_number (from), make_number (pos),
7282 Qcharset, CHARSET_NAME (charset),
7283 coding->dst_object);
/* Number of `int' elements in the work area used for conversion.  */
#define CHARBUF_SIZE 0x4000

/* Give CODING a work area of CHARBUF_SIZE ints obtained with
   SAFE_ALLOCA, and record its size in CODING->charbuf_size.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    (coding)->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));      \
    (coding)->charbuf_size = CHARBUF_SIZE;                              \
  } while (0)
7296 static void
7297 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7299 int *charbuf = coding->charbuf;
7300 int *charbuf_end = charbuf + coding->charbuf_used;
7302 if (NILP (coding->dst_object))
7303 return;
7305 while (charbuf < charbuf_end)
7307 if (*charbuf >= 0)
7308 pos++, charbuf++;
7309 else
7311 int len = -*charbuf;
7313 if (len > 2)
7314 switch (charbuf[1])
7316 case CODING_ANNOTATE_COMPOSITION_MASK:
7317 produce_composition (coding, charbuf, pos);
7318 break;
7319 case CODING_ANNOTATE_CHARSET_MASK:
7320 produce_charset (coding, charbuf, pos);
7321 break;
7323 charbuf += len;
7328 /* Decode the data at CODING->src_object into CODING->dst_object.
7329 CODING->src_object is a buffer, a string, or nil.
7330 CODING->dst_object is a buffer.
7332 If CODING->src_object is a buffer, it must be the current buffer.
7333 In this case, if CODING->src_pos is positive, it is a position of
7334 the source text in the buffer, otherwise, the source text is in the
7335 gap area of the buffer, and CODING->src_pos specifies the offset of
7336 the text from GPT (which must be the same as PT). If this is the
7337 same buffer as CODING->dst_object, CODING->src_pos must be
7338 negative.
7340 If CODING->src_object is a string, CODING->src_pos is an index to
7341 that string.
7343 If CODING->src_object is nil, CODING->source must already point to
7344 the non-relocatable memory area. In this case, CODING->src_pos is
7345 an offset from CODING->source.
7347 The decoded data is inserted at the current point of the buffer
7348 CODING->dst_object.
7351 static void
7352 decode_coding (struct coding_system *coding)
7354 Lisp_Object attrs;
7355 Lisp_Object undo_list;
7356 Lisp_Object translation_table;
7357 struct ccl_spec cclspec;
7358 int carryover;
7359 int i;
7361 USE_SAFE_ALLOCA;
7363 if (BUFFERP (coding->src_object)
7364 && coding->src_pos > 0
7365 && coding->src_pos < GPT
7366 && coding->src_pos + coding->src_chars > GPT)
7367 move_gap_both (coding->src_pos, coding->src_pos_byte);
7369 undo_list = Qt;
7370 if (BUFFERP (coding->dst_object))
7372 set_buffer_internal (XBUFFER (coding->dst_object));
7373 if (GPT != PT)
7374 move_gap_both (PT, PT_BYTE);
7376 /* We must disable undo_list in order to record the whole insert
7377 transaction via record_insert at the end. But doing so also
7378 disables the recording of the first change to the undo_list.
7379 Therefore we check for first change here and record it via
7380 record_first_change if needed. */
7381 if (MODIFF <= SAVE_MODIFF)
7382 record_first_change ();
7384 undo_list = BVAR (current_buffer, undo_list);
7385 bset_undo_list (current_buffer, Qt);
7388 coding->consumed = coding->consumed_char = 0;
7389 coding->produced = coding->produced_char = 0;
7390 coding->chars_at_source = 0;
7391 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7392 coding->errors = 0;
7394 ALLOC_CONVERSION_WORK_AREA (coding);
7396 attrs = CODING_ID_ATTRS (coding->id);
7397 translation_table = get_translation_table (attrs, 0, NULL);
7399 carryover = 0;
7400 if (coding->decoder == decode_coding_ccl)
7402 coding->spec.ccl = &cclspec;
7403 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7407 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7409 coding_set_source (coding);
7410 coding->annotated = 0;
7411 coding->charbuf_used = carryover;
7412 (*(coding->decoder)) (coding);
7413 coding_set_destination (coding);
7414 carryover = produce_chars (coding, translation_table, 0);
7415 if (coding->annotated)
7416 produce_annotation (coding, pos);
7417 for (i = 0; i < carryover; i++)
7418 coding->charbuf[i]
7419 = coding->charbuf[coding->charbuf_used - carryover + i];
7421 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7422 || (coding->consumed < coding->src_bytes
7423 && (coding->result == CODING_RESULT_SUCCESS
7424 || coding->result == CODING_RESULT_INVALID_SRC)));
7426 if (carryover > 0)
7428 coding_set_destination (coding);
7429 coding->charbuf_used = carryover;
7430 produce_chars (coding, translation_table, 1);
7433 coding->carryover_bytes = 0;
7434 if (coding->consumed < coding->src_bytes)
7436 ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7437 const unsigned char *src;
7439 coding_set_source (coding);
7440 coding_set_destination (coding);
7441 src = coding->source + coding->consumed;
7443 if (coding->mode & CODING_MODE_LAST_BLOCK)
7445 /* Flush out unprocessed data as binary chars. We are sure
7446 that the number of data is less than the size of
7447 coding->charbuf. */
7448 coding->charbuf_used = 0;
7449 coding->chars_at_source = 0;
7451 while (nbytes-- > 0)
7453 int c = *src++;
7455 if (c & 0x80)
7456 c = BYTE8_TO_CHAR (c);
7457 coding->charbuf[coding->charbuf_used++] = c;
7459 produce_chars (coding, Qnil, 1);
7461 else
7463 /* Record unprocessed bytes in coding->carryover. We are
7464 sure that the number of data is less than the size of
7465 coding->carryover. */
7466 unsigned char *p = coding->carryover;
7468 if (nbytes > sizeof coding->carryover)
7469 nbytes = sizeof coding->carryover;
7470 coding->carryover_bytes = nbytes;
7471 while (nbytes-- > 0)
7472 *p++ = *src++;
7474 coding->consumed = coding->src_bytes;
7477 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7478 && !inhibit_eol_conversion)
7479 decode_eol (coding);
7480 if (BUFFERP (coding->dst_object))
7482 bset_undo_list (current_buffer, undo_list);
7483 record_insert (coding->dst_pos, coding->produced_char);
7486 SAFE_FREE ();
7490 /* Extract an annotation datum from a composition starting at POS and
7491 ending before LIMIT of CODING->src_object (buffer or string), store
7492 the data in BUF, set *STOP to a starting position of the next
7493 composition (if any) or to LIMIT, and return the address of the
7494 next element of BUF.
7496 If such an annotation is not found, set *STOP to a starting
7497 position of a composition after POS (if any) or to LIMIT, and
7498 return BUF. */
7500 static int *
7501 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7502 struct coding_system *coding, int *buf,
7503 ptrdiff_t *stop)
7505 ptrdiff_t start, end;
7506 Lisp_Object prop;
7508 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7509 || end > limit)
7510 *stop = limit;
7511 else if (start > pos)
7512 *stop = start;
7513 else
7515 if (start == pos)
7517 /* We found a composition. Store the corresponding
7518 annotation data in BUF. */
7519 int *head = buf;
7520 enum composition_method method = composition_method (prop);
7521 int nchars = COMPOSITION_LENGTH (prop);
7523 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7524 if (method != COMPOSITION_RELATIVE)
7526 Lisp_Object components;
7527 ptrdiff_t i, len, i_byte;
7529 components = COMPOSITION_COMPONENTS (prop);
7530 if (VECTORP (components))
7532 len = ASIZE (components);
7533 for (i = 0; i < len; i++)
7534 *buf++ = XINT (AREF (components, i));
7536 else if (STRINGP (components))
7538 len = SCHARS (components);
7539 i = i_byte = 0;
7540 while (i < len)
7542 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7543 buf++;
7546 else if (INTEGERP (components))
7548 len = 1;
7549 *buf++ = XINT (components);
7551 else if (CONSP (components))
7553 for (len = 0; CONSP (components);
7554 len++, components = XCDR (components))
7555 *buf++ = XINT (XCAR (components));
7557 else
7558 emacs_abort ();
7559 *head -= len;
7563 if (find_composition (end, limit, &start, &end, &prop,
7564 coding->src_object)
7565 && end <= limit)
7566 *stop = start;
7567 else
7568 *stop = limit;
7570 return buf;
7574 /* Extract an annotation datum from a text property `charset' at POS of
7575 CODING->src_object (buffer of string), store the data in BUF, set
7576 *STOP to the position where the value of `charset' property changes
7577 (limiting by LIMIT), and return the address of the next element of
7578 BUF.
7580 If the property value is nil, set *STOP to the position where the
7581 property value is non-nil (limiting by LIMIT), and return BUF. */
7583 static int *
7584 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7585 struct coding_system *coding, int *buf,
7586 ptrdiff_t *stop)
7588 Lisp_Object val, next;
7589 int id;
7591 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7592 if (! NILP (val) && CHARSETP (val))
7593 id = XINT (CHARSET_SYMBOL_ID (val));
7594 else
7595 id = -1;
7596 ADD_CHARSET_DATA (buf, 0, id);
7597 next = Fnext_single_property_change (make_number (pos), Qcharset,
7598 coding->src_object,
7599 make_number (limit));
7600 *stop = XINT (next);
7601 return buf;
7605 static void
7606 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7607 int max_lookup)
7609 int *buf = coding->charbuf;
7610 int *buf_end = coding->charbuf + coding->charbuf_size;
7611 const unsigned char *src = coding->source + coding->consumed;
7612 const unsigned char *src_end = coding->source + coding->src_bytes;
7613 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7614 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7615 bool multibytep = coding->src_multibyte;
7616 Lisp_Object eol_type;
7617 int c;
7618 ptrdiff_t stop, stop_composition, stop_charset;
7619 int *lookup_buf = NULL;
7621 if (! NILP (translation_table))
7622 lookup_buf = alloca (sizeof (int) * max_lookup);
7624 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7625 if (VECTORP (eol_type))
7626 eol_type = Qunix;
7628 /* Note: composition handling is not yet implemented. */
7629 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7631 if (NILP (coding->src_object))
7632 stop = stop_composition = stop_charset = end_pos;
7633 else
7635 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7636 stop = stop_composition = pos;
7637 else
7638 stop = stop_composition = end_pos;
7639 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7640 stop = stop_charset = pos;
7641 else
7642 stop_charset = end_pos;
7645 /* Compensate for CRLF and conversion. */
7646 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7647 while (buf < buf_end)
7649 Lisp_Object trans;
7651 if (pos == stop)
7653 if (pos == end_pos)
7654 break;
7655 if (pos == stop_composition)
7656 buf = handle_composition_annotation (pos, end_pos, coding,
7657 buf, &stop_composition);
7658 if (pos == stop_charset)
7659 buf = handle_charset_annotation (pos, end_pos, coding,
7660 buf, &stop_charset);
7661 stop = (stop_composition < stop_charset
7662 ? stop_composition : stop_charset);
7665 if (! multibytep)
7667 int bytes;
7669 if (coding->encoder == encode_coding_raw_text
7670 || coding->encoder == encode_coding_ccl)
7671 c = *src++, pos++;
7672 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7673 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7674 else
7675 c = BYTE8_TO_CHAR (*src), src++, pos++;
7677 else
7678 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7679 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7680 c = '\n';
7681 if (! EQ (eol_type, Qunix))
7683 if (c == '\n')
7685 if (EQ (eol_type, Qdos))
7686 *buf++ = '\r';
7687 else
7688 c = '\r';
7692 trans = Qnil;
7693 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7694 if (NILP (trans))
7695 *buf++ = c;
7696 else
7698 ptrdiff_t from_nchars = 1, to_nchars = 1;
7699 int *lookup_buf_end;
7700 const unsigned char *p = src;
7701 int i;
7703 lookup_buf[0] = c;
7704 for (i = 1; i < max_lookup && p < src_end; i++)
7705 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7706 lookup_buf_end = lookup_buf + i;
7707 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7708 if (INTEGERP (trans))
7709 c = XINT (trans);
7710 else if (CONSP (trans))
7712 from_nchars = ASIZE (XCAR (trans));
7713 trans = XCDR (trans);
7714 if (INTEGERP (trans))
7715 c = XINT (trans);
7716 else
7718 to_nchars = ASIZE (trans);
7719 if (buf_end - buf < to_nchars)
7720 break;
7721 c = XINT (AREF (trans, 0));
7724 else
7725 break;
7726 *buf++ = c;
7727 for (i = 1; i < to_nchars; i++)
7728 *buf++ = XINT (AREF (trans, i));
7729 for (i = 1; i < from_nchars; i++, pos++)
7730 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7734 coding->consumed = src - coding->source;
7735 coding->consumed_char = pos - coding->src_pos;
7736 coding->charbuf_used = buf - coding->charbuf;
7737 coding->chars_at_source = 0;
7741 /* Encode the text at CODING->src_object into CODING->dst_object.
7742 CODING->src_object is a buffer or a string.
7743 CODING->dst_object is a buffer or nil.
7745 If CODING->src_object is a buffer, it must be the current buffer.
7746 In this case, if CODING->src_pos is positive, it is a position of
7747 the source text in the buffer, otherwise. the source text is in the
7748 gap area of the buffer, and coding->src_pos specifies the offset of
7749 the text from GPT (which must be the same as PT). If this is the
7750 same buffer as CODING->dst_object, CODING->src_pos must be
7751 negative and CODING should not have `pre-write-conversion'.
7753 If CODING->src_object is a string, CODING should not have
7754 `pre-write-conversion'.
7756 If CODING->dst_object is a buffer, the encoded data is inserted at
7757 the current point of that buffer.
7759 If CODING->dst_object is nil, the encoded data is placed at the
7760 memory area specified by CODING->destination. */
7762 static void
7763 encode_coding (struct coding_system *coding)
7765 Lisp_Object attrs;
7766 Lisp_Object translation_table;
7767 int max_lookup;
7768 struct ccl_spec cclspec;
7770 USE_SAFE_ALLOCA;
7772 attrs = CODING_ID_ATTRS (coding->id);
7773 if (coding->encoder == encode_coding_raw_text)
7774 translation_table = Qnil, max_lookup = 0;
7775 else
7776 translation_table = get_translation_table (attrs, 1, &max_lookup);
7778 if (BUFFERP (coding->dst_object))
7780 set_buffer_internal (XBUFFER (coding->dst_object));
7781 coding->dst_multibyte
7782 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7785 coding->consumed = coding->consumed_char = 0;
7786 coding->produced = coding->produced_char = 0;
7787 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7788 coding->errors = 0;
7790 ALLOC_CONVERSION_WORK_AREA (coding);
7792 if (coding->encoder == encode_coding_ccl)
7794 coding->spec.ccl = &cclspec;
7795 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7797 do {
7798 coding_set_source (coding);
7799 consume_chars (coding, translation_table, max_lookup);
7800 coding_set_destination (coding);
7801 (*(coding->encoder)) (coding);
7802 } while (coding->consumed_char < coding->src_chars);
7804 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7805 insert_from_gap (coding->produced_char, coding->produced, 0);
7807 SAFE_FREE ();
7811 /* Name (or base name) of work buffer for code conversion. */
7812 static Lisp_Object Vcode_conversion_workbuf_name;
7814 /* A working buffer used by the top level conversion. Once it is
7815 created, it is never destroyed. It has the name
7816 Vcode_conversion_workbuf_name. The other working buffers are
7817 destroyed after the use is finished, and their names are modified
7818 versions of Vcode_conversion_workbuf_name. */
7819 static Lisp_Object Vcode_conversion_reused_workbuf;
7821 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7822 static bool reused_workbuf_in_use;
7825 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7826 multibyteness of returning buffer. */
7828 static Lisp_Object
7829 make_conversion_work_buffer (bool multibyte)
7831 Lisp_Object name, workbuf;
7832 struct buffer *current;
7834 if (reused_workbuf_in_use)
7836 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7837 workbuf = Fget_buffer_create (name);
7839 else
7841 reused_workbuf_in_use = 1;
7842 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7843 Vcode_conversion_reused_workbuf
7844 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7845 workbuf = Vcode_conversion_reused_workbuf;
7847 current = current_buffer;
7848 set_buffer_internal (XBUFFER (workbuf));
7849 /* We can't allow modification hooks to run in the work buffer. For
7850 instance, directory_files_internal assumes that file decoding
7851 doesn't compile new regexps. */
7852 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7853 Ferase_buffer ();
7854 bset_undo_list (current_buffer, Qt);
7855 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7856 set_buffer_internal (current);
7857 return workbuf;
7861 static void
7862 code_conversion_restore (Lisp_Object arg)
7864 Lisp_Object current, workbuf;
7865 struct gcpro gcpro1;
7867 GCPRO1 (arg);
7868 current = XCAR (arg);
7869 workbuf = XCDR (arg);
7870 if (! NILP (workbuf))
7872 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7873 reused_workbuf_in_use = 0;
7874 else
7875 Fkill_buffer (workbuf);
7877 set_buffer_internal (XBUFFER (current));
7878 UNGCPRO;
7881 Lisp_Object
7882 code_conversion_save (bool with_work_buf, bool multibyte)
7884 Lisp_Object workbuf = Qnil;
7886 if (with_work_buf)
7887 workbuf = make_conversion_work_buffer (multibyte);
7888 record_unwind_protect (code_conversion_restore,
7889 Fcons (Fcurrent_buffer (), workbuf));
7890 return workbuf;
7893 void
7894 decode_coding_gap (struct coding_system *coding,
7895 ptrdiff_t chars, ptrdiff_t bytes)
7897 ptrdiff_t count = SPECPDL_INDEX ();
7898 Lisp_Object attrs;
7900 coding->src_object = Fcurrent_buffer ();
7901 coding->src_chars = chars;
7902 coding->src_bytes = bytes;
7903 coding->src_pos = -chars;
7904 coding->src_pos_byte = -bytes;
7905 coding->src_multibyte = chars < bytes;
7906 coding->dst_object = coding->src_object;
7907 coding->dst_pos = PT;
7908 coding->dst_pos_byte = PT_BYTE;
7909 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7911 coding->head_ascii = -1;
7912 coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7913 coding->eol_seen = EOL_SEEN_NONE;
7914 if (CODING_REQUIRE_DETECTION (coding))
7915 detect_coding (coding);
7916 attrs = CODING_ID_ATTRS (coding->id);
7917 if (! disable_ascii_optimization
7918 && ! coding->src_multibyte
7919 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7920 && NILP (CODING_ATTR_POST_READ (attrs))
7921 && NILP (get_translation_table (attrs, 0, NULL)))
7923 chars = coding->head_ascii;
7924 if (chars < 0)
7925 chars = check_ascii (coding);
7926 if (chars != bytes)
7928 /* There exists a non-ASCII byte. */
7929 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7930 && coding->detected_utf8_bytes == coding->src_bytes)
7932 if (coding->detected_utf8_chars >= 0)
7933 chars = coding->detected_utf8_chars;
7934 else
7935 chars = check_utf_8 (coding);
7936 if (CODING_UTF_8_BOM (coding) != utf_without_bom
7937 && coding->head_ascii == 0
7938 && coding->source[0] == UTF_8_BOM_1
7939 && coding->source[1] == UTF_8_BOM_2
7940 && coding->source[2] == UTF_8_BOM_3)
7942 chars--;
7943 bytes -= 3;
7944 coding->src_bytes -= 3;
7947 else
7948 chars = -1;
7950 if (chars >= 0)
7952 Lisp_Object eol_type;
7954 eol_type = CODING_ID_EOL_TYPE (coding->id);
7955 if (VECTORP (eol_type))
7957 if (coding->eol_seen != EOL_SEEN_NONE)
7958 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7960 if (EQ (eol_type, Qmac))
7962 unsigned char *src_end = GAP_END_ADDR;
7963 unsigned char *src = src_end - coding->src_bytes;
7965 while (src < src_end)
7967 if (*src++ == '\r')
7968 src[-1] = '\n';
7971 else if (EQ (eol_type, Qdos))
7973 unsigned char *src = GAP_END_ADDR;
7974 unsigned char *src_beg = src - coding->src_bytes;
7975 unsigned char *dst = src;
7976 ptrdiff_t diff;
7978 while (src_beg < src)
7980 *--dst = *--src;
7981 if (*src == '\n' && src > src_beg && src[-1] == '\r')
7982 src--;
7984 diff = dst - src;
7985 bytes -= diff;
7986 chars -= diff;
7988 coding->produced = bytes;
7989 coding->produced_char = chars;
7990 insert_from_gap (chars, bytes, 1);
7991 return;
7994 code_conversion_save (0, 0);
7996 coding->mode |= CODING_MODE_LAST_BLOCK;
7997 current_buffer->text->inhibit_shrinking = 1;
7998 decode_coding (coding);
7999 current_buffer->text->inhibit_shrinking = 0;
8001 if (! NILP (CODING_ATTR_POST_READ (attrs)))
8003 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8004 Lisp_Object val;
8006 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8007 val = call1 (CODING_ATTR_POST_READ (attrs),
8008 make_number (coding->produced_char));
8009 CHECK_NATNUM (val);
8010 coding->produced_char += Z - prev_Z;
8011 coding->produced += Z_BYTE - prev_Z_BYTE;
8014 unbind_to (count, Qnil);
8018 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8019 SRC_OBJECT into DST_OBJECT by coding context CODING.
8021 SRC_OBJECT is a buffer, a string, or Qnil.
8023 If it is a buffer, the text is at point of the buffer. FROM and TO
8024 are positions in the buffer.
8026 If it is a string, the text is at the beginning of the string.
8027 FROM and TO are indices to the string.
8029 If it is nil, the text is at coding->source. FROM and TO are
8030 indices to coding->source.
8032 DST_OBJECT is a buffer, Qt, or Qnil.
8034 If it is a buffer, the decoded text is inserted at point of the
8035 buffer. If the buffer is the same as SRC_OBJECT, the source text
8036 is deleted.
8038 If it is Qt, a string is made from the decoded text, and
8039 set in CODING->dst_object.
8041 If it is Qnil, the decoded text is stored at CODING->destination.
8042 The caller must allocate CODING->dst_bytes bytes at
8043 CODING->destination by xmalloc. If the decoded text is longer than
8044 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8047 void
8048 decode_coding_object (struct coding_system *coding,
8049 Lisp_Object src_object,
8050 ptrdiff_t from, ptrdiff_t from_byte,
8051 ptrdiff_t to, ptrdiff_t to_byte,
8052 Lisp_Object dst_object)
8054 ptrdiff_t count = SPECPDL_INDEX ();
8055 unsigned char *destination IF_LINT (= NULL);
8056 ptrdiff_t dst_bytes IF_LINT (= 0);
8057 ptrdiff_t chars = to - from;
8058 ptrdiff_t bytes = to_byte - from_byte;
8059 Lisp_Object attrs;
8060 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8061 bool need_marker_adjustment = 0;
8062 Lisp_Object old_deactivate_mark;
8064 old_deactivate_mark = Vdeactivate_mark;
8066 if (NILP (dst_object))
8068 destination = coding->destination;
8069 dst_bytes = coding->dst_bytes;
8072 coding->src_object = src_object;
8073 coding->src_chars = chars;
8074 coding->src_bytes = bytes;
8075 coding->src_multibyte = chars < bytes;
8077 if (STRINGP (src_object))
8079 coding->src_pos = from;
8080 coding->src_pos_byte = from_byte;
8082 else if (BUFFERP (src_object))
8084 set_buffer_internal (XBUFFER (src_object));
8085 if (from != GPT)
8086 move_gap_both (from, from_byte);
8087 if (EQ (src_object, dst_object))
8089 struct Lisp_Marker *tail;
8091 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8093 tail->need_adjustment
8094 = tail->charpos == (tail->insertion_type ? from : to);
8095 need_marker_adjustment |= tail->need_adjustment;
8097 saved_pt = PT, saved_pt_byte = PT_BYTE;
8098 TEMP_SET_PT_BOTH (from, from_byte);
8099 current_buffer->text->inhibit_shrinking = 1;
8100 del_range_both (from, from_byte, to, to_byte, 1);
8101 coding->src_pos = -chars;
8102 coding->src_pos_byte = -bytes;
8104 else
8106 coding->src_pos = from;
8107 coding->src_pos_byte = from_byte;
8111 if (CODING_REQUIRE_DETECTION (coding))
8112 detect_coding (coding);
8113 attrs = CODING_ID_ATTRS (coding->id);
8115 if (EQ (dst_object, Qt)
8116 || (! NILP (CODING_ATTR_POST_READ (attrs))
8117 && NILP (dst_object)))
8119 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8120 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8121 coding->dst_pos = BEG;
8122 coding->dst_pos_byte = BEG_BYTE;
8124 else if (BUFFERP (dst_object))
8126 code_conversion_save (0, 0);
8127 coding->dst_object = dst_object;
8128 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8129 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8130 coding->dst_multibyte
8131 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8133 else
8135 code_conversion_save (0, 0);
8136 coding->dst_object = Qnil;
8137 /* Most callers presume this will return a multibyte result, and they
8138 won't use `binary' or `raw-text' anyway, so let's not worry about
8139 CODING_FOR_UNIBYTE. */
8140 coding->dst_multibyte = 1;
8143 decode_coding (coding);
8145 if (BUFFERP (coding->dst_object))
8146 set_buffer_internal (XBUFFER (coding->dst_object));
8148 if (! NILP (CODING_ATTR_POST_READ (attrs)))
8150 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8151 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8152 Lisp_Object val;
8154 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8155 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8156 old_deactivate_mark);
8157 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8158 make_number (coding->produced_char));
8159 UNGCPRO;
8160 CHECK_NATNUM (val);
8161 coding->produced_char += Z - prev_Z;
8162 coding->produced += Z_BYTE - prev_Z_BYTE;
8165 if (EQ (dst_object, Qt))
8167 coding->dst_object = Fbuffer_string ();
8169 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8171 set_buffer_internal (XBUFFER (coding->dst_object));
8172 if (dst_bytes < coding->produced)
8174 eassert (coding->produced > 0);
8175 destination = xrealloc (destination, coding->produced);
8176 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8177 move_gap_both (BEGV, BEGV_BYTE);
8178 memcpy (destination, BEGV_ADDR, coding->produced);
8179 coding->destination = destination;
8183 if (saved_pt >= 0)
8185 /* This is the case of:
8186 (BUFFERP (src_object) && EQ (src_object, dst_object))
8187 As we have moved PT while replacing the original buffer
8188 contents, we must recover it now. */
8189 set_buffer_internal (XBUFFER (src_object));
8190 current_buffer->text->inhibit_shrinking = 0;
8191 if (saved_pt < from)
8192 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8193 else if (saved_pt < from + chars)
8194 TEMP_SET_PT_BOTH (from, from_byte);
8195 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8196 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8197 saved_pt_byte + (coding->produced - bytes));
8198 else
8199 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8200 saved_pt_byte + (coding->produced - bytes));
8202 if (need_marker_adjustment)
8204 struct Lisp_Marker *tail;
8206 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8207 if (tail->need_adjustment)
8209 tail->need_adjustment = 0;
8210 if (tail->insertion_type)
8212 tail->bytepos = from_byte;
8213 tail->charpos = from;
8215 else
8217 tail->bytepos = from_byte + coding->produced;
8218 tail->charpos
8219 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8220 ? tail->bytepos : from + coding->produced_char);
8226 Vdeactivate_mark = old_deactivate_mark;
8227 unbind_to (count, coding->dst_object);
8231 void
8232 encode_coding_object (struct coding_system *coding,
8233 Lisp_Object src_object,
8234 ptrdiff_t from, ptrdiff_t from_byte,
8235 ptrdiff_t to, ptrdiff_t to_byte,
8236 Lisp_Object dst_object)
8238 ptrdiff_t count = SPECPDL_INDEX ();
8239 ptrdiff_t chars = to - from;
8240 ptrdiff_t bytes = to_byte - from_byte;
8241 Lisp_Object attrs;
8242 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8243 bool need_marker_adjustment = 0;
8244 bool kill_src_buffer = 0;
8245 Lisp_Object old_deactivate_mark;
8247 old_deactivate_mark = Vdeactivate_mark;
8249 coding->src_object = src_object;
8250 coding->src_chars = chars;
8251 coding->src_bytes = bytes;
8252 coding->src_multibyte = chars < bytes;
8254 attrs = CODING_ID_ATTRS (coding->id);
8256 if (EQ (src_object, dst_object))
8258 struct Lisp_Marker *tail;
8260 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8262 tail->need_adjustment
8263 = tail->charpos == (tail->insertion_type ? from : to);
8264 need_marker_adjustment |= tail->need_adjustment;
8268 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8270 coding->src_object = code_conversion_save (1, coding->src_multibyte);
8271 set_buffer_internal (XBUFFER (coding->src_object));
8272 if (STRINGP (src_object))
8273 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8274 else if (BUFFERP (src_object))
8275 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8276 else
8277 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8279 if (EQ (src_object, dst_object))
8281 set_buffer_internal (XBUFFER (src_object));
8282 saved_pt = PT, saved_pt_byte = PT_BYTE;
8283 del_range_both (from, from_byte, to, to_byte, 1);
8284 set_buffer_internal (XBUFFER (coding->src_object));
8288 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8290 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8291 old_deactivate_mark);
8292 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8293 make_number (BEG), make_number (Z));
8294 UNGCPRO;
8296 if (XBUFFER (coding->src_object) != current_buffer)
8297 kill_src_buffer = 1;
8298 coding->src_object = Fcurrent_buffer ();
8299 if (BEG != GPT)
8300 move_gap_both (BEG, BEG_BYTE);
8301 coding->src_chars = Z - BEG;
8302 coding->src_bytes = Z_BYTE - BEG_BYTE;
8303 coding->src_pos = BEG;
8304 coding->src_pos_byte = BEG_BYTE;
8305 coding->src_multibyte = Z < Z_BYTE;
8307 else if (STRINGP (src_object))
8309 code_conversion_save (0, 0);
8310 coding->src_pos = from;
8311 coding->src_pos_byte = from_byte;
8313 else if (BUFFERP (src_object))
8315 code_conversion_save (0, 0);
8316 set_buffer_internal (XBUFFER (src_object));
8317 if (EQ (src_object, dst_object))
8319 saved_pt = PT, saved_pt_byte = PT_BYTE;
8320 coding->src_object = del_range_1 (from, to, 1, 1);
8321 coding->src_pos = 0;
8322 coding->src_pos_byte = 0;
8324 else
8326 if (from < GPT && to >= GPT)
8327 move_gap_both (from, from_byte);
8328 coding->src_pos = from;
8329 coding->src_pos_byte = from_byte;
8332 else
8333 code_conversion_save (0, 0);
8335 if (BUFFERP (dst_object))
8337 coding->dst_object = dst_object;
8338 if (EQ (src_object, dst_object))
8340 coding->dst_pos = from;
8341 coding->dst_pos_byte = from_byte;
8343 else
8345 struct buffer *current = current_buffer;
8347 set_buffer_temp (XBUFFER (dst_object));
8348 coding->dst_pos = PT;
8349 coding->dst_pos_byte = PT_BYTE;
8350 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8351 set_buffer_temp (current);
8353 coding->dst_multibyte
8354 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8356 else if (EQ (dst_object, Qt))
8358 ptrdiff_t dst_bytes = max (1, coding->src_chars);
8359 coding->dst_object = Qnil;
8360 coding->destination = xmalloc (dst_bytes);
8361 coding->dst_bytes = dst_bytes;
8362 coding->dst_multibyte = 0;
8364 else
8366 coding->dst_object = Qnil;
8367 coding->dst_multibyte = 0;
8370 encode_coding (coding);
8372 if (EQ (dst_object, Qt))
8374 if (BUFFERP (coding->dst_object))
8375 coding->dst_object = Fbuffer_string ();
8376 else if (coding->raw_destination)
8377 /* This is used to avoid creating huge Lisp string.
8378 NOTE: caller who sets `raw_destination' is also
8379 responsible for freeing `destination' buffer. */
8380 coding->dst_object = Qnil;
8381 else
8383 coding->dst_object
8384 = make_unibyte_string ((char *) coding->destination,
8385 coding->produced);
8386 xfree (coding->destination);
8390 if (saved_pt >= 0)
8392 /* This is the case of:
8393 (BUFFERP (src_object) && EQ (src_object, dst_object))
8394 As we have moved PT while replacing the original buffer
8395 contents, we must recover it now. */
8396 set_buffer_internal (XBUFFER (src_object));
8397 if (saved_pt < from)
8398 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8399 else if (saved_pt < from + chars)
8400 TEMP_SET_PT_BOTH (from, from_byte);
8401 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8402 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8403 saved_pt_byte + (coding->produced - bytes));
8404 else
8405 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8406 saved_pt_byte + (coding->produced - bytes));
8408 if (need_marker_adjustment)
8410 struct Lisp_Marker *tail;
8412 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8413 if (tail->need_adjustment)
8415 tail->need_adjustment = 0;
8416 if (tail->insertion_type)
8418 tail->bytepos = from_byte;
8419 tail->charpos = from;
8421 else
8423 tail->bytepos = from_byte + coding->produced;
8424 tail->charpos
8425 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8426 ? tail->bytepos : from + coding->produced_char);
8432 if (kill_src_buffer)
8433 Fkill_buffer (coding->src_object);
8435 Vdeactivate_mark = old_deactivate_mark;
8436 unbind_to (count, Qnil);
8440 Lisp_Object
8441 preferred_coding_system (void)
8443 int id = coding_categories[coding_priorities[0]].id;
8445 return CODING_ID_NAME (id);
8448 #if defined (WINDOWSNT) || defined (CYGWIN)
8450 Lisp_Object
8451 from_unicode (Lisp_Object str)
8453 CHECK_STRING (str);
8454 if (!STRING_MULTIBYTE (str) &&
8455 SBYTES (str) & 1)
8457 str = Fsubstring (str, make_number (0), make_number (-1));
8460 return code_convert_string_norecord (str, Qutf_16le, 0);
8463 Lisp_Object
8464 from_unicode_buffer (const wchar_t* wstr)
8466 return from_unicode (
8467 make_unibyte_string (
8468 (char*) wstr,
8469 /* we get one of the two final 0 bytes for free. */
8470 1 + sizeof (wchar_t) * wcslen (wstr)));
8473 wchar_t *
8474 to_unicode (Lisp_Object str, Lisp_Object *buf)
8476 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8477 /* We need to make another copy (in addition to the one made by
8478 code_convert_string_norecord) to ensure that the final string is
8479 _doubly_ zero terminated --- that is, that the string is
8480 terminated by two zero bytes and one utf-16le null character.
8481 Because strings are already terminated with a single zero byte,
8482 we just add one additional zero. */
8483 str = make_uninit_string (SBYTES (*buf) + 1);
8484 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8485 SDATA (str) [SBYTES (*buf)] = '\0';
8486 *buf = str;
8487 return WCSDATA (*buf);
8490 #endif /* WINDOWSNT || CYGWIN */
8493 #ifdef emacs
8494 /*** 11. Emacs Lisp library functions ***/
8496 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8497 doc: /* Return t if OBJECT is nil or a coding-system.
8498 See the documentation of `define-coding-system' for information
8499 about coding-system objects. */)
8500 (Lisp_Object object)
8502 if (NILP (object)
8503 || CODING_SYSTEM_ID (object) >= 0)
8504 return Qt;
8505 if (! SYMBOLP (object)
8506 || NILP (Fget (object, Qcoding_system_define_form)))
8507 return Qnil;
8508 return Qt;
8511 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8512 Sread_non_nil_coding_system, 1, 1, 0,
8513 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8514 (Lisp_Object prompt)
8516 Lisp_Object val;
8519 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8520 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8522 while (SCHARS (val) == 0);
8523 return (Fintern (val, Qnil));
8526 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8527 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8528 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8529 Ignores case when completing coding systems (all Emacs coding systems
8530 are lower-case). */)
8531 (Lisp_Object prompt, Lisp_Object default_coding_system)
8533 Lisp_Object val;
8534 ptrdiff_t count = SPECPDL_INDEX ();
8536 if (SYMBOLP (default_coding_system))
8537 default_coding_system = SYMBOL_NAME (default_coding_system);
8538 specbind (Qcompletion_ignore_case, Qt);
8539 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8540 Qt, Qnil, Qcoding_system_history,
8541 default_coding_system, Qnil);
8542 unbind_to (count, Qnil);
8543 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8546 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8547 1, 1, 0,
8548 doc: /* Check validity of CODING-SYSTEM.
8549 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8550 It is valid if it is nil or a symbol defined as a coding system by the
8551 function `define-coding-system'. */)
8552 (Lisp_Object coding_system)
8554 Lisp_Object define_form;
8556 define_form = Fget (coding_system, Qcoding_system_define_form);
8557 if (! NILP (define_form))
8559 Fput (coding_system, Qcoding_system_define_form, Qnil);
8560 safe_eval (define_form);
8562 if (!NILP (Fcoding_system_p (coding_system)))
8563 return coding_system;
8564 xsignal1 (Qcoding_system_error, coding_system);
8568 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
   HIGHEST, return the coding system of the highest priority among
   the detected coding systems.  Otherwise return a list of detected
   coding systems sorted by their priorities.  If MULTIBYTEP, it is
   assumed that the bytes are in correct multibyte form but contain
   only ASCII and eight-bit chars.  Otherwise, the bytes are raw
   bytes.

   CODING-SYSTEM controls the detection as below:

   If it is nil, detect both text-format and eol-format.  If the
   text-format part of CODING-SYSTEM is already specified
   (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
   part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
   detect only text-format.  */
8584 Lisp_Object
8585 detect_coding_system (const unsigned char *src,
8586 ptrdiff_t src_chars, ptrdiff_t src_bytes,
8587 bool highest, bool multibytep,
8588 Lisp_Object coding_system)
8590 const unsigned char *src_end = src + src_bytes;
8591 Lisp_Object attrs, eol_type;
8592 Lisp_Object val = Qnil;
8593 struct coding_system coding;
8594 ptrdiff_t id;
8595 struct coding_detection_info detect_info;
8596 enum coding_category base_category;
8597 bool null_byte_found = 0, eight_bit_found = 0;
/* A nil CODING-SYSTEM means `undecided'.  */
8599 if (NILP (coding_system))
8600 coding_system = Qundecided;
8601 setup_coding_system (coding_system, &coding);
8602 attrs = CODING_ID_ATTRS (coding.id);
8603 eol_type = CODING_ID_EOL_TYPE (coding.id);
8604 coding_system = CODING_ATTR_BASE_NAME (attrs);
8606 coding.source = src;
8607 coding.src_chars = src_chars;
8608 coding.src_bytes = src_bytes;
8609 coding.src_multibyte = multibytep;
8610 coding.consumed = 0;
8611 coding.mode |= CODING_MODE_LAST_BLOCK;
8612 coding.head_ascii = 0;
8614 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8616 /* At first, detect text-format if necessary.  */
8617 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8618 if (base_category == coding_category_undecided)
8620 enum coding_category category IF_LINT (= 0);
8621 struct coding_system *this IF_LINT (= NULL);
8622 int c, i;
8623 bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8624 inhibit_null_byte_detection);
8625 bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8626 inhibit_iso_escape_detection);
8627 bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8629 /* Skip all ASCII bytes except for a few ISO2022 controls.
   coding.head_ascii counts the bytes seen before the first
   eight-bit byte.  */
8630 for (; src < src_end; src++)
8632 c = *src;
8633 if (c & 0x80)
8635 eight_bit_found = 1;
8636 if (null_byte_found)
8637 break;
8639 else if (c < 0x20)
8641 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8642 && ! inhibit_ied
8643 && ! detect_info.checked)
8645 if (detect_coding_iso_2022 (&coding, &detect_info))
8647 /* We have scanned the whole data.  */
8648 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8650 /* We didn't find an 8-bit code.  We may
   have found a null-byte, but it's very
   rare that a binary file conforms to
   ISO-2022.  */
8654 src = src_end;
8655 coding.head_ascii = src - coding.source;
8657 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8658 break;
8661 else if (! c && !inhibit_nbd)
8663 null_byte_found = 1;
8664 if (eight_bit_found)
8665 break;
8667 if (! eight_bit_found)
8668 coding.head_ascii++;
8670 else if (! eight_bit_found)
8671 coding.head_ascii++;
8674 if (null_byte_found || eight_bit_found
8675 || coding.head_ascii < coding.src_bytes
8676 || detect_info.found)
8678 if (coding.head_ascii == coding.src_bytes)
8679 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8680 for (i = 0; i < coding_category_raw_text; i++)
8682 category = coding_priorities[i];
8683 this = coding_categories + category;
8684 if (detect_info.found & (1 << category))
8685 break;
8687 else
8689 if (null_byte_found)
8691 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8692 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8694 else if (prefer_utf_8
8695 && detect_coding_utf_8 (&coding, &detect_info))
8697 detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8698 detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
/* Run the detectors of the categories not checked yet, in
   priority order.  */
8700 for (i = 0; i < coding_category_raw_text; i++)
8702 category = coding_priorities[i];
8703 this = coding_categories + category;
8705 if (this->id < 0)
8707 /* No coding system of this category is defined.  */
8708 detect_info.rejected |= (1 << category);
8710 else if (category >= coding_category_raw_text)
8711 continue;
8712 else if (detect_info.checked & (1 << category))
8714 if (highest
8715 && (detect_info.found & (1 << category)))
8716 break;
8718 else if ((*(this->detector)) (&coding, &detect_info)
8719 && highest
8720 && (detect_info.found & (1 << category)))
8722 if (category == coding_category_utf_16_auto)
8724 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8725 category = coding_category_utf_16_le;
8726 else
8727 category = coding_category_utf_16_be;
8729 break;
/* Turn the detection result into VAL, a list of coding system IDs.  */
8735 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8736 || null_byte_found)
8738 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8739 id = CODING_SYSTEM_ID (Qno_conversion);
8740 val = list1 (make_number (id));
8742 else if (! detect_info.rejected && ! detect_info.found)
8744 detect_info.found = CATEGORY_MASK_ANY;
8745 id = coding_categories[coding_category_undecided].id;
8746 val = list1 (make_number (id));
8748 else if (highest)
8750 if (detect_info.found)
8752 detect_info.found = 1 << category;
8753 val = list1 (make_number (this->id));
8755 else
/* Nothing was positively found: take the first category, in
   priority order, that was not rejected.  */
8756 for (i = 0; i < coding_category_raw_text; i++)
8757 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8759 detect_info.found = 1 << coding_priorities[i];
8760 id = coding_categories[coding_priorities[i]].id;
8761 val = list1 (make_number (id));
8762 break;
8765 else
8767 int mask = detect_info.rejected | detect_info.found;
8768 int found = 0;
/* First consider the categories that were neither found nor
   rejected.  Note that VAL is overwritten, not extended, by each
   such category that has a coding system.  */
8770 for (i = coding_category_raw_text - 1; i >= 0; i--)
8772 category = coding_priorities[i];
8773 if (! (mask & (1 << category)))
8775 found |= 1 << category;
8776 id = coding_categories[category].id;
8777 if (id >= 0)
8778 val = list1 (make_number (id));
/* Then push the found categories in front, walking from the lowest
   priority to the highest so that the list ends up sorted by
   priority.  */
8781 for (i = coding_category_raw_text - 1; i >= 0; i--)
8783 category = coding_priorities[i];
8784 if (detect_info.found & (1 << category))
8786 id = coding_categories[category].id;
8787 val = Fcons (make_number (id), val);
8790 detect_info.found |= found;
8793 else if (base_category == coding_category_utf_8_auto)
8795 if (detect_coding_utf_8 (&coding, &detect_info))
8797 struct coding_system *this;
8799 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8800 this = coding_categories + coding_category_utf_8_sig;
8801 else
8802 this = coding_categories + coding_category_utf_8_nosig;
8803 val = list1 (make_number (this->id));
8806 else if (base_category == coding_category_utf_16_auto)
8808 if (detect_coding_utf_16 (&coding, &detect_info))
8810 struct coding_system *this;
8812 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8813 this = coding_categories + coding_category_utf_16_le;
8814 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8815 this = coding_categories + coding_category_utf_16_be;
8816 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8817 this = coding_categories + coding_category_utf_16_be_nosig;
8818 else
8819 this = coding_categories + coding_category_utf_16_le_nosig;
8820 val = list1 (make_number (this->id));
8823 else
/* The text-format is already specified; use it as is.  */
8825 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8826 val = list1 (make_number (coding.id));
8829 /* Then, detect eol-format if necessary.  */
8831 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8832 Lisp_Object tail;
8834 if (VECTORP (eol_type))
8836 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8838 if (null_byte_found)
8839 normal_eol = EOL_SEEN_LF;
8840 else
8841 normal_eol = detect_eol (coding.source, src_bytes,
8842 coding_category_raw_text);
8844 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8845 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8846 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8847 coding_category_utf_16_be);
8848 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8849 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8850 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8851 coding_category_utf_16_le);
8853 else
8855 if (EQ (eol_type, Qunix))
8856 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8857 else if (EQ (eol_type, Qdos))
8858 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8859 else
8860 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
/* Replace each coding system ID in VAL by a coding system name: the
   subsidiary for the end-of-line format seen when the coding system
   has a vector of them and a format was seen, otherwise the name of
   the coding system itself.  */
8863 for (tail = val; CONSP (tail); tail = XCDR (tail))
8865 enum coding_category category;
8866 int this_eol;
8868 id = XINT (XCAR (tail));
8869 attrs = CODING_ID_ATTRS (id);
8870 category = XINT (CODING_ATTR_CATEGORY (attrs));
8871 eol_type = CODING_ID_EOL_TYPE (id);
8872 if (VECTORP (eol_type))
8874 if (category == coding_category_utf_16_be
8875 || category == coding_category_utf_16_be_nosig)
8876 this_eol = utf_16_be_eol;
8877 else if (category == coding_category_utf_16_le
8878 || category == coding_category_utf_16_le_nosig)
8879 this_eol = utf_16_le_eol;
8880 else
8881 this_eol = normal_eol;
8883 if (this_eol == EOL_SEEN_LF)
8884 XSETCAR (tail, AREF (eol_type, 0));
8885 else if (this_eol == EOL_SEEN_CRLF)
8886 XSETCAR (tail, AREF (eol_type, 1));
8887 else if (this_eol == EOL_SEEN_CR)
8888 XSETCAR (tail, AREF (eol_type, 2));
8889 else
8890 XSETCAR (tail, CODING_ID_NAME (id));
8892 else
8893 XSETCAR (tail, CODING_ID_NAME (id));
/* With HIGHEST, return just the first element of VAL, or nil if VAL
   is empty; otherwise return the whole list.  */
8897 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8901 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8902 2, 3, 0,
8903 doc: /* Detect coding system of the text in the region between START and END.
8904 Return a list of possible coding systems ordered by priority.
8905 The coding systems to try and their priorities follows what
8906 the function `coding-system-priority-list' (which see) returns.
8908 If only ASCII characters are found (except for such ISO-2022 control
8909 characters as ESC), it returns a list of single element `undecided'
8910 or its subsidiary coding system according to a detected end-of-line
8911 format.
8913 If optional argument HIGHEST is non-nil, return the coding system of
8914 highest priority. */)
8915 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8917 ptrdiff_t from, to;
8918 ptrdiff_t from_byte, to_byte;
8920 validate_region (&start, &end);
8921 from = XINT (start), to = XINT (end);
8922 from_byte = CHAR_TO_BYTE (from);
8923 to_byte = CHAR_TO_BYTE (to);
8925 if (from < GPT && to >= GPT)
8926 move_gap_both (to, to_byte);
8928 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8929 to - from, to_byte - from_byte,
8930 !NILP (highest),
8931 !NILP (BVAR (current_buffer
8932 , enable_multibyte_characters)),
8933 Qnil);
8936 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8937 1, 2, 0,
8938 doc: /* Detect coding system of the text in STRING.
8939 Return a list of possible coding systems ordered by priority.
8940 The coding systems to try and their priorities follows what
8941 the function `coding-system-priority-list' (which see) returns.
8943 If only ASCII characters are found (except for such ISO-2022 control
8944 characters as ESC), it returns a list of single element `undecided'
8945 or its subsidiary coding system according to a detected end-of-line
8946 format.
8948 If optional argument HIGHEST is non-nil, return the coding system of
8949 highest priority. */)
8950 (Lisp_Object string, Lisp_Object highest)
8952 CHECK_STRING (string);
8954 return detect_coding_system (SDATA (string),
8955 SCHARS (string), SBYTES (string),
8956 !NILP (highest), STRING_MULTIBYTE (string),
8957 Qnil);
8961 static bool
8962 char_encodable_p (int c, Lisp_Object attrs)
8964 Lisp_Object tail;
8965 struct charset *charset;
8966 Lisp_Object translation_table;
8968 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8969 if (! NILP (translation_table))
8970 c = translate_char (translation_table, c);
8971 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8972 CONSP (tail); tail = XCDR (tail))
8974 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8975 if (CHAR_CHARSET_P (c, charset))
8976 break;
8978 return (! NILP (tail));
8982 /* Return a list of coding systems that safely encode the text between
8983 START and END. If EXCLUDE is non-nil, it is a list of coding
8984 systems not to check. The returned list doesn't contain any such
8985 coding systems. In any case, if the text contains only ASCII or is
8986 unibyte, return t. */
8988 DEFUN ("find-coding-systems-region-internal",
8989 Ffind_coding_systems_region_internal,
8990 Sfind_coding_systems_region_internal, 2, 3, 0,
8991 doc: /* Internal use only. */)
8992 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8994 Lisp_Object coding_attrs_list, safe_codings;
8995 ptrdiff_t start_byte, end_byte;
8996 const unsigned char *p, *pbeg, *pend;
8997 int c;
8998 Lisp_Object tail, elt, work_table;
9000 if (STRINGP (start))
9002 if (!STRING_MULTIBYTE (start)
9003 || SCHARS (start) == SBYTES (start))
9004 return Qt;
9005 start_byte = 0;
9006 end_byte = SBYTES (start);
9008 else
9010 CHECK_NUMBER_COERCE_MARKER (start);
9011 CHECK_NUMBER_COERCE_MARKER (end);
9012 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9013 args_out_of_range (start, end);
9014 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9015 return Qt;
9016 start_byte = CHAR_TO_BYTE (XINT (start));
9017 end_byte = CHAR_TO_BYTE (XINT (end));
9018 if (XINT (end) - XINT (start) == end_byte - start_byte)
9019 return Qt;
9021 if (XINT (start) < GPT && XINT (end) > GPT)
9023 if ((GPT - XINT (start)) < (XINT (end) - GPT))
9024 move_gap_both (XINT (start), start_byte);
9025 else
9026 move_gap_both (XINT (end), end_byte);
9030 coding_attrs_list = Qnil;
9031 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9032 if (NILP (exclude)
9033 || NILP (Fmemq (XCAR (tail), exclude)))
9035 Lisp_Object attrs;
9037 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9038 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9040 ASET (attrs, coding_attr_trans_tbl,
9041 get_translation_table (attrs, 1, NULL));
9042 coding_attrs_list = Fcons (attrs, coding_attrs_list);
9046 if (STRINGP (start))
9047 p = pbeg = SDATA (start);
9048 else
9049 p = pbeg = BYTE_POS_ADDR (start_byte);
9050 pend = p + (end_byte - start_byte);
9052 while (p < pend && ASCII_BYTE_P (*p)) p++;
9053 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9055 work_table = Fmake_char_table (Qnil, Qnil);
9056 while (p < pend)
9058 if (ASCII_BYTE_P (*p))
9059 p++;
9060 else
9062 c = STRING_CHAR_ADVANCE (p);
9063 if (!NILP (char_table_ref (work_table, c)))
9064 /* This character was already checked. Ignore it. */
9065 continue;
9067 charset_map_loaded = 0;
9068 for (tail = coding_attrs_list; CONSP (tail);)
9070 elt = XCAR (tail);
9071 if (NILP (elt))
9072 tail = XCDR (tail);
9073 else if (char_encodable_p (c, elt))
9074 tail = XCDR (tail);
9075 else if (CONSP (XCDR (tail)))
9077 XSETCAR (tail, XCAR (XCDR (tail)));
9078 XSETCDR (tail, XCDR (XCDR (tail)));
9080 else
9082 XSETCAR (tail, Qnil);
9083 tail = XCDR (tail);
9086 if (charset_map_loaded)
9088 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9090 if (STRINGP (start))
9091 pbeg = SDATA (start);
9092 else
9093 pbeg = BYTE_POS_ADDR (start_byte);
9094 p = pbeg + p_offset;
9095 pend = pbeg + pend_offset;
9097 char_table_set (work_table, c, Qt);
9101 safe_codings = list2 (Qraw_text, Qno_conversion);
9102 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9103 if (! NILP (XCAR (tail)))
9104 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9106 return safe_codings;
9110 DEFUN ("unencodable-char-position", Funencodable_char_position,
9111 Sunencodable_char_position, 3, 5, 0,
9112 doc: /*
9113 Return position of first un-encodable character in a region.
9114 START and END specify the region and CODING-SYSTEM specifies the
9115 encoding to check. Return nil if CODING-SYSTEM does encode the region.
9117 If optional 4th argument COUNT is non-nil, it specifies at most how
9118 many un-encodable characters to search. In this case, the value is a
9119 list of positions.
9121 If optional 5th argument STRING is non-nil, it is a string to search
9122 for un-encodable characters. In that case, START and END are indexes
9123 to the string. */)
9124 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9126 EMACS_INT n;
9127 struct coding_system coding;
9128 Lisp_Object attrs, charset_list, translation_table;
9129 Lisp_Object positions;
9130 ptrdiff_t from, to;
9131 const unsigned char *p, *stop, *pend;
9132 bool ascii_compatible;
9134 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9135 attrs = CODING_ID_ATTRS (coding.id);
9136 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9137 return Qnil;
9138 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9139 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9140 translation_table = get_translation_table (attrs, 1, NULL);
9142 if (NILP (string))
9144 validate_region (&start, &end);
9145 from = XINT (start);
9146 to = XINT (end);
9147 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9148 || (ascii_compatible
9149 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9150 return Qnil;
9151 p = CHAR_POS_ADDR (from);
9152 pend = CHAR_POS_ADDR (to);
9153 if (from < GPT && to >= GPT)
9154 stop = GPT_ADDR;
9155 else
9156 stop = pend;
9158 else
9160 CHECK_STRING (string);
9161 CHECK_NATNUM (start);
9162 CHECK_NATNUM (end);
9163 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9164 args_out_of_range_3 (string, start, end);
9165 from = XINT (start);
9166 to = XINT (end);
9167 if (! STRING_MULTIBYTE (string))
9168 return Qnil;
9169 p = SDATA (string) + string_char_to_byte (string, from);
9170 stop = pend = SDATA (string) + string_char_to_byte (string, to);
9171 if (ascii_compatible && (to - from) == (pend - p))
9172 return Qnil;
9175 if (NILP (count))
9176 n = 1;
9177 else
9179 CHECK_NATNUM (count);
9180 n = XINT (count);
9183 positions = Qnil;
9184 charset_map_loaded = 0;
9185 while (1)
9187 int c;
9189 if (ascii_compatible)
9190 while (p < stop && ASCII_BYTE_P (*p))
9191 p++, from++;
9192 if (p >= stop)
9194 if (p >= pend)
9195 break;
9196 stop = pend;
9197 p = GAP_END_ADDR;
9200 c = STRING_CHAR_ADVANCE (p);
9201 if (! (ASCII_CHAR_P (c) && ascii_compatible)
9202 && ! char_charset (translate_char (translation_table, c),
9203 charset_list, NULL))
9205 positions = Fcons (make_number (from), positions);
9206 n--;
9207 if (n == 0)
9208 break;
9211 from++;
9212 if (charset_map_loaded && NILP (string))
9214 p = CHAR_POS_ADDR (from);
9215 pend = CHAR_POS_ADDR (to);
9216 if (from < GPT && to >= GPT)
9217 stop = GPT_ADDR;
9218 else
9219 stop = pend;
9220 charset_map_loaded = 0;
9224 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9228 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9229 Scheck_coding_systems_region, 3, 3, 0,
9230 doc: /* Check if the region is encodable by coding systems.
9232 START and END are buffer positions specifying the region.
9233 CODING-SYSTEM-LIST is a list of coding systems to check.
9235 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9236 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9237 whole region, POS0, POS1, ... are buffer positions where non-encodable
9238 characters are found.
9240 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9241 value is nil.
9243 START may be a string. In that case, check if the string is
9244 encodable, and the value contains indices to the string instead of
9245 buffer positions. END is ignored.
9247 If the current buffer (or START if it is a string) is unibyte, the value
9248 is nil. */)
9249 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9251 Lisp_Object list;
9252 ptrdiff_t start_byte, end_byte;
9253 ptrdiff_t pos;
9254 const unsigned char *p, *pbeg, *pend;
9255 int c;
9256 Lisp_Object tail, elt, attrs;
9258 if (STRINGP (start))
9260 if (!STRING_MULTIBYTE (start)
9261 || SCHARS (start) == SBYTES (start))
9262 return Qnil;
9263 start_byte = 0;
9264 end_byte = SBYTES (start);
9265 pos = 0;
9267 else
9269 CHECK_NUMBER_COERCE_MARKER (start);
9270 CHECK_NUMBER_COERCE_MARKER (end);
9271 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9272 args_out_of_range (start, end);
9273 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9274 return Qnil;
9275 start_byte = CHAR_TO_BYTE (XINT (start));
9276 end_byte = CHAR_TO_BYTE (XINT (end));
9277 if (XINT (end) - XINT (start) == end_byte - start_byte)
9278 return Qnil;
9280 if (XINT (start) < GPT && XINT (end) > GPT)
9282 if ((GPT - XINT (start)) < (XINT (end) - GPT))
9283 move_gap_both (XINT (start), start_byte);
9284 else
9285 move_gap_both (XINT (end), end_byte);
9287 pos = XINT (start);
9290 list = Qnil;
9291 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9293 elt = XCAR (tail);
9294 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9295 ASET (attrs, coding_attr_trans_tbl,
9296 get_translation_table (attrs, 1, NULL));
9297 list = Fcons (list2 (elt, attrs), list);
9300 if (STRINGP (start))
9301 p = pbeg = SDATA (start);
9302 else
9303 p = pbeg = BYTE_POS_ADDR (start_byte);
9304 pend = p + (end_byte - start_byte);
9306 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9307 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9309 while (p < pend)
9311 if (ASCII_BYTE_P (*p))
9312 p++;
9313 else
9315 c = STRING_CHAR_ADVANCE (p);
9317 charset_map_loaded = 0;
9318 for (tail = list; CONSP (tail); tail = XCDR (tail))
9320 elt = XCDR (XCAR (tail));
9321 if (! char_encodable_p (c, XCAR (elt)))
9322 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9324 if (charset_map_loaded)
9326 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9328 if (STRINGP (start))
9329 pbeg = SDATA (start);
9330 else
9331 pbeg = BYTE_POS_ADDR (start_byte);
9332 p = pbeg + p_offset;
9333 pend = pbeg + pend_offset;
9336 pos++;
9339 tail = list;
9340 list = Qnil;
9341 for (; CONSP (tail); tail = XCDR (tail))
9343 elt = XCAR (tail);
9344 if (CONSP (XCDR (XCDR (elt))))
9345 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9346 list);
9349 return list;
9353 static Lisp_Object
9354 code_convert_region (Lisp_Object start, Lisp_Object end,
9355 Lisp_Object coding_system, Lisp_Object dst_object,
9356 bool encodep, bool norecord)
9358 struct coding_system coding;
9359 ptrdiff_t from, from_byte, to, to_byte;
9360 Lisp_Object src_object;
9362 if (NILP (coding_system))
9363 coding_system = Qno_conversion;
9364 else
9365 CHECK_CODING_SYSTEM (coding_system);
9366 src_object = Fcurrent_buffer ();
9367 if (NILP (dst_object))
9368 dst_object = src_object;
9369 else if (! EQ (dst_object, Qt))
9370 CHECK_BUFFER (dst_object);
9372 validate_region (&start, &end);
9373 from = XFASTINT (start);
9374 from_byte = CHAR_TO_BYTE (from);
9375 to = XFASTINT (end);
9376 to_byte = CHAR_TO_BYTE (to);
9378 setup_coding_system (coding_system, &coding);
9379 coding.mode |= CODING_MODE_LAST_BLOCK;
9381 if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9383 struct buffer *buf = XBUFFER (dst_object);
9384 ptrdiff_t buf_pt = BUF_PT (buf);
9386 invalidate_buffer_caches (buf, buf_pt, buf_pt);
9389 if (encodep)
9390 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9391 dst_object);
9392 else
9393 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9394 dst_object);
9395 if (! norecord)
9396 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9398 return (BUFFERP (dst_object)
9399 ? make_number (coding.produced_char)
9400 : coding.dst_object);
9404 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9405 3, 4, "r\nzCoding system: ",
9406 doc: /* Decode the current region from the specified coding system.
9407 When called from a program, takes four arguments:
9408 START, END, CODING-SYSTEM, and DESTINATION.
9409 START and END are buffer positions.
9411 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9412 If nil, the region between START and END is replaced by the decoded text.
9413 If buffer, the decoded text is inserted in that buffer after point (point
9414 does not move).
9415 In those cases, the length of the decoded text is returned.
9416 If DESTINATION is t, the decoded text is returned.
9418 This function sets `last-coding-system-used' to the precise coding system
9419 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9420 not fully specified.) */)
9421 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9423 return code_convert_region (start, end, coding_system, destination, 0, 0);
9426 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9427 3, 4, "r\nzCoding system: ",
9428 doc: /* Encode the current region by specified coding system.
9429 When called from a program, takes four arguments:
9430 START, END, CODING-SYSTEM and DESTINATION.
9431 START and END are buffer positions.
9433 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9434 If nil, the region between START and END is replace by the encoded text.
9435 If buffer, the encoded text is inserted in that buffer after point (point
9436 does not move).
9437 In those cases, the length of the encoded text is returned.
9438 If DESTINATION is t, the encoded text is returned.
9440 This function sets `last-coding-system-used' to the precise coding system
9441 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9442 not fully specified.) */)
9443 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9445 return code_convert_region (start, end, coding_system, destination, 1, 0);
9448 Lisp_Object
9449 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9450 Lisp_Object dst_object, bool encodep, bool nocopy,
9451 bool norecord)
9453 struct coding_system coding;
9454 ptrdiff_t chars, bytes;
9456 CHECK_STRING (string);
9457 if (NILP (coding_system))
9459 if (! norecord)
9460 Vlast_coding_system_used = Qno_conversion;
9461 if (NILP (dst_object))
9462 return (nocopy ? Fcopy_sequence (string) : string);
9465 if (NILP (coding_system))
9466 coding_system = Qno_conversion;
9467 else
9468 CHECK_CODING_SYSTEM (coding_system);
9469 if (NILP (dst_object))
9470 dst_object = Qt;
9471 else if (! EQ (dst_object, Qt))
9472 CHECK_BUFFER (dst_object);
9474 setup_coding_system (coding_system, &coding);
9475 coding.mode |= CODING_MODE_LAST_BLOCK;
9476 chars = SCHARS (string);
9477 bytes = SBYTES (string);
9479 if (BUFFERP (dst_object))
9481 struct buffer *buf = XBUFFER (dst_object);
9482 ptrdiff_t buf_pt = BUF_PT (buf);
9484 invalidate_buffer_caches (buf, buf_pt, buf_pt);
9487 if (encodep)
9488 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9489 else
9490 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9491 if (! norecord)
9492 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9494 return (BUFFERP (dst_object)
9495 ? make_number (coding.produced_char)
9496 : coding.dst_object);
9500 /* Encode or decode STRING according to CODING_SYSTEM.
9501 Do not set Vlast_coding_system_used.
9503 This function is called only from macros DECODE_FILE and
9504 ENCODE_FILE, thus we ignore character composition. */
9506 Lisp_Object
9507 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9508 bool encodep)
9510 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9513 /* Encode or decode a file name, to or from a unibyte string suitable
9514 for passing to C library functions. */
9515 Lisp_Object
9516 decode_file_name (Lisp_Object fname)
9518 #ifdef WINDOWSNT
9519 /* The w32 build pretends to use UTF-8 for file-name encoding, and
9520 converts the file names either to UTF-16LE or to the system ANSI
9521 codepage internally, depending on the underlying OS; see w32.c. */
9522 if (! NILP (Fcoding_system_p (Qutf_8)))
9523 return code_convert_string_norecord (fname, Qutf_8, 0);
9524 return fname;
9525 #else /* !WINDOWSNT */
9526 if (! NILP (Vfile_name_coding_system))
9527 return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9528 else if (! NILP (Vdefault_file_name_coding_system))
9529 return code_convert_string_norecord (fname,
9530 Vdefault_file_name_coding_system, 0);
9531 else
9532 return fname;
9533 #endif
9536 Lisp_Object
9537 encode_file_name (Lisp_Object fname)
9539 /* This is especially important during bootstrap and dumping, when
9540 file-name encoding is not yet known, and therefore any non-ASCII
9541 file names are unibyte strings, and could only be thrashed if we
9542 try to encode them. */
9543 if (!STRING_MULTIBYTE (fname))
9544 return fname;
9545 #ifdef WINDOWSNT
9546 /* The w32 build pretends to use UTF-8 for file-name encoding, and
9547 converts the file names either to UTF-16LE or to the system ANSI
9548 codepage internally, depending on the underlying OS; see w32.c. */
9549 if (! NILP (Fcoding_system_p (Qutf_8)))
9550 return code_convert_string_norecord (fname, Qutf_8, 1);
9551 return fname;
9552 #else /* !WINDOWSNT */
9553 if (! NILP (Vfile_name_coding_system))
9554 return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9555 else if (! NILP (Vdefault_file_name_coding_system))
9556 return code_convert_string_norecord (fname,
9557 Vdefault_file_name_coding_system, 1);
9558 else
9559 return fname;
9560 #endif
9563 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9564 2, 4, 0,
9565 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9567 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9568 if the decoding operation is trivial.
9570 Optional fourth arg BUFFER non-nil means that the decoded text is
9571 inserted in that buffer after point (point does not move). In this
9572 case, the return value is the length of the decoded text.
9574 This function sets `last-coding-system-used' to the precise coding system
9575 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9576 not fully specified.) */)
9577 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9579 return code_convert_string (string, coding_system, buffer,
9580 0, ! NILP (nocopy), 0);
9583 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9584 2, 4, 0,
9585 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9587 Optional third arg NOCOPY non-nil means it is OK to return STRING
9588 itself if the encoding operation is trivial.
9590 Optional fourth arg BUFFER non-nil means that the encoded text is
9591 inserted in that buffer after point (point does not move). In this
9592 case, the return value is the length of the encoded text.
9594 This function sets `last-coding-system-used' to the precise coding system
9595 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9596 not fully specified.) */)
9597 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9599 return code_convert_string (string, coding_system, buffer,
9600 1, ! NILP (nocopy), 0);
9604 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9605 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9606 Return the corresponding character. */)
9607 (Lisp_Object code)
9609 Lisp_Object spec, attrs, val;
9610 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9611 EMACS_INT ch;
9612 int c;
9614 CHECK_NATNUM (code);
9615 ch = XFASTINT (code);
9616 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9617 attrs = AREF (spec, 0);
9619 if (ASCII_BYTE_P (ch)
9620 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9621 return code;
9623 val = CODING_ATTR_CHARSET_LIST (attrs);
9624 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9625 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9626 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9628 if (ch <= 0x7F)
9630 c = ch;
9631 charset = charset_roman;
9633 else if (ch >= 0xA0 && ch < 0xDF)
9635 c = ch - 0x80;
9636 charset = charset_kana;
9638 else
9640 EMACS_INT c1 = ch >> 8;
9641 int c2 = ch & 0xFF;
9643 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9644 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9645 error ("Invalid code: %"pI"d", ch);
9646 c = ch;
9647 SJIS_TO_JIS (c);
9648 charset = charset_kanji;
9650 c = DECODE_CHAR (charset, c);
9651 if (c < 0)
9652 error ("Invalid code: %"pI"d", ch);
9653 return make_number (c);
9657 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9658 doc: /* Encode a Japanese character CH to shift_jis encoding.
9659 Return the corresponding code in SJIS. */)
9660 (Lisp_Object ch)
9662 Lisp_Object spec, attrs, charset_list;
9663 int c;
9664 struct charset *charset;
9665 unsigned code;
9667 CHECK_CHARACTER (ch);
9668 c = XFASTINT (ch);
9669 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9670 attrs = AREF (spec, 0);
9672 if (ASCII_CHAR_P (c)
9673 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9674 return ch;
9676 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9677 charset = char_charset (c, charset_list, &code);
9678 if (code == CHARSET_INVALID_CODE (charset))
9679 error ("Can't encode by shift_jis encoding: %c", c);
9680 JIS_TO_SJIS (code);
9682 return make_number (code);
9685 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9686 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9687 Return the corresponding character. */)
9688 (Lisp_Object code)
9690 Lisp_Object spec, attrs, val;
9691 struct charset *charset_roman, *charset_big5, *charset;
9692 EMACS_INT ch;
9693 int c;
9695 CHECK_NATNUM (code);
9696 ch = XFASTINT (code);
9697 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9698 attrs = AREF (spec, 0);
9700 if (ASCII_BYTE_P (ch)
9701 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9702 return code;
9704 val = CODING_ATTR_CHARSET_LIST (attrs);
9705 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9706 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9708 if (ch <= 0x7F)
9710 c = ch;
9711 charset = charset_roman;
9713 else
9715 EMACS_INT b1 = ch >> 8;
9716 int b2 = ch & 0x7F;
9717 if (b1 < 0xA1 || b1 > 0xFE
9718 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9719 error ("Invalid code: %"pI"d", ch);
9720 c = ch;
9721 charset = charset_big5;
9723 c = DECODE_CHAR (charset, c);
9724 if (c < 0)
9725 error ("Invalid code: %"pI"d", ch);
9726 return make_number (c);
9729 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9730 doc: /* Encode the Big5 character CH to BIG5 coding system.
9731 Return the corresponding character code in Big5. */)
9732 (Lisp_Object ch)
9734 Lisp_Object spec, attrs, charset_list;
9735 struct charset *charset;
9736 int c;
9737 unsigned code;
9739 CHECK_CHARACTER (ch);
9740 c = XFASTINT (ch);
9741 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9742 attrs = AREF (spec, 0);
9743 if (ASCII_CHAR_P (c)
9744 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9745 return ch;
9747 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9748 charset = char_charset (c, charset_list, &code);
9749 if (code == CHARSET_INVALID_CODE (charset))
9750 error ("Can't encode by Big5 encoding: %c", c);
9752 return make_number (code);
9756 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9757 Sset_terminal_coding_system_internal, 1, 2, 0,
9758 doc: /* Internal use only. */)
9759 (Lisp_Object coding_system, Lisp_Object terminal)
9761 struct terminal *term = get_terminal (terminal, 1);
9762 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9763 CHECK_SYMBOL (coding_system);
9764 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9765 /* We had better not send unsafe characters to terminal. */
9766 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9767 /* Character composition should be disabled. */
9768 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9769 terminal_coding->src_multibyte = 1;
9770 terminal_coding->dst_multibyte = 0;
9771 tset_charset_list
9772 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9773 ? coding_charset_list (terminal_coding)
9774 : list1 (make_number (charset_ascii))));
9775 return Qnil;
9778 DEFUN ("set-safe-terminal-coding-system-internal",
9779 Fset_safe_terminal_coding_system_internal,
9780 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9781 doc: /* Internal use only. */)
9782 (Lisp_Object coding_system)
9784 CHECK_SYMBOL (coding_system);
9785 setup_coding_system (Fcheck_coding_system (coding_system),
9786 &safe_terminal_coding);
9787 /* Character composition should be disabled. */
9788 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9789 safe_terminal_coding.src_multibyte = 1;
9790 safe_terminal_coding.dst_multibyte = 0;
9791 return Qnil;
9794 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9795 Sterminal_coding_system, 0, 1, 0,
9796 doc: /* Return coding system specified for terminal output on the given terminal.
9797 TERMINAL may be a terminal object, a frame, or nil for the selected
9798 frame's terminal device. */)
9799 (Lisp_Object terminal)
9801 struct coding_system *terminal_coding
9802 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9803 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9805 /* For backward compatibility, return nil if it is `undecided'. */
9806 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9809 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9810 Sset_keyboard_coding_system_internal, 1, 2, 0,
9811 doc: /* Internal use only. */)
9812 (Lisp_Object coding_system, Lisp_Object terminal)
9814 struct terminal *t = get_terminal (terminal, 1);
9815 CHECK_SYMBOL (coding_system);
9816 if (NILP (coding_system))
9817 coding_system = Qno_conversion;
9818 else
9819 Fcheck_coding_system (coding_system);
9820 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9821 /* Character composition should be disabled. */
9822 TERMINAL_KEYBOARD_CODING (t)->common_flags
9823 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9824 return Qnil;
9827 DEFUN ("keyboard-coding-system",
9828 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9829 doc: /* Return coding system specified for decoding keyboard input. */)
9830 (Lisp_Object terminal)
9832 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9833 (get_terminal (terminal, 1))->id);
9837 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9838 Sfind_operation_coding_system, 1, MANY, 0,
9839 doc: /* Choose a coding system for an operation based on the target name.
9840 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9841 DECODING-SYSTEM is the coding system to use for decoding
9842 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9843 for encoding (in case OPERATION does encoding).
9845 The first argument OPERATION specifies an I/O primitive:
9846 For file I/O, `insert-file-contents' or `write-region'.
9847 For process I/O, `call-process', `call-process-region', or `start-process'.
9848 For network I/O, `open-network-stream'.
9850 The remaining arguments should be the same arguments that were passed
9851 to the primitive. Depending on which primitive, one of those arguments
9852 is selected as the TARGET. For example, if OPERATION does file I/O,
9853 whichever argument specifies the file name is TARGET.
9855 TARGET has a meaning which depends on OPERATION:
9856 For file I/O, TARGET is a file name (except for the special case below).
9857 For process I/O, TARGET is a process name.
9858 For network I/O, TARGET is a service name or a port number.
9860 This function looks up what is specified for TARGET in
9861 `file-coding-system-alist', `process-coding-system-alist',
9862 or `network-coding-system-alist' depending on OPERATION.
9863 They may specify a coding system, a cons of coding systems,
9864 or a function symbol to call.
9865 In the last case, we call the function with one argument,
9866 which is a list of all the arguments given to this function.
9867 If the function can't decide a coding system, it can return
9868 `undecided' so that the normal code-detection is performed.
9870 If OPERATION is `insert-file-contents', the argument corresponding to
9871 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9872 file name to look up, and BUFFER is a buffer that contains the file's
9873 contents (not yet decoded). If `file-coding-system-alist' specifies a
9874 function to call for FILENAME, that function should examine the
9875 contents of BUFFER instead of reading the file.
9877 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9878 (ptrdiff_t nargs, Lisp_Object *args)
9880 Lisp_Object operation, target_idx, target, val;
9881 register Lisp_Object chain;
9883 if (nargs < 2)
9884 error ("Too few arguments");
9885 operation = args[0];
9886 if (!SYMBOLP (operation)
9887 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9888 error ("Invalid first argument");
9889 if (nargs <= 1 + XFASTINT (target_idx))
9890 error ("Too few arguments for operation `%s'",
9891 SDATA (SYMBOL_NAME (operation)));
9892 target = args[XFASTINT (target_idx) + 1];
9893 if (!(STRINGP (target)
9894 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9895 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9896 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9897 error ("Invalid argument %"pI"d of operation `%s'",
9898 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9899 if (CONSP (target))
9900 target = XCAR (target);
9902 chain = ((EQ (operation, Qinsert_file_contents)
9903 || EQ (operation, Qwrite_region))
9904 ? Vfile_coding_system_alist
9905 : (EQ (operation, Qopen_network_stream)
9906 ? Vnetwork_coding_system_alist
9907 : Vprocess_coding_system_alist));
9908 if (NILP (chain))
9909 return Qnil;
9911 for (; CONSP (chain); chain = XCDR (chain))
9913 Lisp_Object elt;
9915 elt = XCAR (chain);
9916 if (CONSP (elt)
9917 && ((STRINGP (target)
9918 && STRINGP (XCAR (elt))
9919 && fast_string_match (XCAR (elt), target) >= 0)
9920 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9922 val = XCDR (elt);
9923 /* Here, if VAL is both a valid coding system and a valid
9924 function symbol, we return VAL as a coding system. */
9925 if (CONSP (val))
9926 return val;
9927 if (! SYMBOLP (val))
9928 return Qnil;
9929 if (! NILP (Fcoding_system_p (val)))
9930 return Fcons (val, val);
9931 if (! NILP (Ffboundp (val)))
9933 /* We use call1 rather than safe_call1
9934 so as to get bug reports about functions called here
9935 which don't handle the current interface. */
9936 val = call1 (val, Flist (nargs, args));
9937 if (CONSP (val))
9938 return val;
9939 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9940 return Fcons (val, val);
9942 return Qnil;
9945 return Qnil;
9948 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9949 Sset_coding_system_priority, 0, MANY, 0,
9950 doc: /* Assign higher priority to the coding systems given as arguments.
9951 If multiple coding systems belong to the same category,
9952 all but the first one are ignored.
9954 usage: (set-coding-system-priority &rest coding-systems) */)
9955 (ptrdiff_t nargs, Lisp_Object *args)
9957 ptrdiff_t i, j;
9958 bool changed[coding_category_max];
9959 enum coding_category priorities[coding_category_max];
9961 memset (changed, 0, sizeof changed);
9963 for (i = j = 0; i < nargs; i++)
9965 enum coding_category category;
9966 Lisp_Object spec, attrs;
9968 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9969 attrs = AREF (spec, 0);
9970 category = XINT (CODING_ATTR_CATEGORY (attrs));
9971 if (changed[category])
9972 /* Ignore this coding system because a coding system of the
9973 same category already had a higher priority. */
9974 continue;
9975 changed[category] = 1;
9976 priorities[j++] = category;
9977 if (coding_categories[category].id >= 0
9978 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9979 setup_coding_system (args[i], &coding_categories[category]);
9980 Fset (AREF (Vcoding_category_table, category), args[i]);
9983 /* Now we have decided top J priorities. Reflect the order of the
9984 original priorities to the remaining priorities. */
9986 for (i = j, j = 0; i < coding_category_max; i++, j++)
9988 while (j < coding_category_max
9989 && changed[coding_priorities[j]])
9990 j++;
9991 if (j == coding_category_max)
9992 emacs_abort ();
9993 priorities[i] = coding_priorities[j];
9996 memcpy (coding_priorities, priorities, sizeof priorities);
9998 /* Update `coding-category-list'. */
9999 Vcoding_category_list = Qnil;
10000 for (i = coding_category_max; i-- > 0; )
10001 Vcoding_category_list
10002 = Fcons (AREF (Vcoding_category_table, priorities[i]),
10003 Vcoding_category_list);
10005 return Qnil;
10008 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
10009 Scoding_system_priority_list, 0, 1, 0,
10010 doc: /* Return a list of coding systems ordered by their priorities.
10011 The list contains a subset of coding systems; i.e. coding systems
10012 assigned to each coding category (see `coding-category-list').
10014 HIGHESTP non-nil means just return the highest priority one. */)
10015 (Lisp_Object highestp)
10017 int i;
10018 Lisp_Object val;
10020 for (i = 0, val = Qnil; i < coding_category_max; i++)
10022 enum coding_category category = coding_priorities[i];
10023 int id = coding_categories[category].id;
10024 Lisp_Object attrs;
10026 if (id < 0)
10027 continue;
10028 attrs = CODING_ID_ATTRS (id);
10029 if (! NILP (highestp))
10030 return CODING_ATTR_BASE_NAME (attrs);
10031 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10033 return Fnreverse (val);
10036 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10038 static Lisp_Object
10039 make_subsidiaries (Lisp_Object base)
10041 Lisp_Object subsidiaries;
10042 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10043 char *buf = alloca (base_name_len + 6);
10044 int i;
10046 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10047 subsidiaries = make_uninit_vector (3);
10048 for (i = 0; i < 3; i++)
10050 strcpy (buf + base_name_len, suffixes[i]);
10051 ASET (subsidiaries, i, intern (buf));
10053 return subsidiaries;
/* Define a coding system from the positional arguments ARGS, which
   are indexed by the coding_arg_* constants.  Each argument is
   validated and stored into a fresh attribute vector ATTRS; the
   coding category is then derived from the coding type, and NAME is
   registered in Vcoding_system_hash_table, Vcoding_system_list and
   Vcoding_system_alist.  When no eol-type is given, three subsidiary
   coding systems are registered as well.  Signals
   `wrong-number-of-arguments' when NARGS is too small for the
   requested coding type.  */
10057 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10058 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10059 doc: /* For internal use only.
10060 usage: (define-coding-system-internal ...) */)
10061 (ptrdiff_t nargs, Lisp_Object *args)
10063 Lisp_Object name;
10064 Lisp_Object spec_vec; /* [ ATTRS ALIASES EOL_TYPE ] */
10065 Lisp_Object attrs; /* Vector of attributes. */
10066 Lisp_Object eol_type;
10067 Lisp_Object aliases;
10068 Lisp_Object coding_type, charset_list, safe_charsets;
10069 enum coding_category category;
10070 Lisp_Object tail, val;
10071 int max_charset_id = 0;
10072 int i;
10074 if (nargs < coding_arg_max)
10075 goto short_args;
10077 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10079 name = args[coding_arg_name];
10080 CHECK_SYMBOL (name);
10081 ASET (attrs, coding_attr_base_name, name);
10083 val = args[coding_arg_mnemonic];
10084 if (! STRINGP (val))
10085 CHECK_CHARACTER (val);
10086 ASET (attrs, coding_attr_mnemonic, val);
10088 coding_type = args[coding_arg_coding_type];
10089 CHECK_SYMBOL (coding_type);
10090 ASET (attrs, coding_attr_type, coding_type);
/* Resolve the charset list.  The symbols `iso-2022' and `emacs-mule'
   select Viso_2022_charset_list and Vemacs_mule_charset_list (and are
   only accepted for the matching coding type); otherwise a copy of
   the given list has each charset replaced by its ID.
   MAX_CHARSET_ID tracks the largest ID encountered.  */
10092 charset_list = args[coding_arg_charset_list];
10093 if (SYMBOLP (charset_list))
10095 if (EQ (charset_list, Qiso_2022))
10097 if (! EQ (coding_type, Qiso_2022))
10098 error ("Invalid charset-list");
10099 charset_list = Viso_2022_charset_list;
10101 else if (EQ (charset_list, Qemacs_mule))
10103 if (! EQ (coding_type, Qemacs_mule))
10104 error ("Invalid charset-list");
10105 charset_list = Vemacs_mule_charset_list;
10107 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10109 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10110 error ("Invalid charset-list");
10111 if (max_charset_id < XFASTINT (XCAR (tail)))
10112 max_charset_id = XFASTINT (XCAR (tail));
10115 else
10117 charset_list = Fcopy_sequence (charset_list);
10118 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10120 struct charset *charset;
10122 val = XCAR (tail);
10123 CHECK_CHARSET_GET_CHARSET (val, charset);
10124 if (EQ (coding_type, Qiso_2022)
10125 ? CHARSET_ISO_FINAL (charset) < 0
10126 : EQ (coding_type, Qemacs_mule)
10127 ? CHARSET_EMACS_MULE_ID (charset) < 0
10128 : 0)
10129 error ("Can't handle charset `%s'",
10130 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10132 XSETCAR (tail, make_number (charset->id));
10133 if (max_charset_id < charset->id)
10134 max_charset_id = charset->id;
10137 ASET (attrs, coding_attr_charset_list, charset_list);
/* Build a string indexed by charset ID: byte 0 for every charset in
   CHARSET_LIST, byte 255 everywhere else.  */
10139 safe_charsets = make_uninit_string (max_charset_id + 1);
10140 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10141 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10142 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10143 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10145 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10147 val = args[coding_arg_decode_translation_table];
10148 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10149 CHECK_SYMBOL (val);
10150 ASET (attrs, coding_attr_decode_tbl, val);
10152 val = args[coding_arg_encode_translation_table];
10153 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10154 CHECK_SYMBOL (val);
10155 ASET (attrs, coding_attr_encode_tbl, val);
10157 val = args[coding_arg_post_read_conversion];
10158 CHECK_SYMBOL (val);
10159 ASET (attrs, coding_attr_post_read, val);
10161 val = args[coding_arg_pre_write_conversion];
10162 CHECK_SYMBOL (val);
10163 ASET (attrs, coding_attr_pre_write, val);
/* A nil default character falls back to a space.  */
10165 val = args[coding_arg_default_char];
10166 if (NILP (val))
10167 ASET (attrs, coding_attr_default_char, make_number (' '));
10168 else
10170 CHECK_CHARACTER (val);
10171 ASET (attrs, coding_attr_default_char, val);
10174 val = args[coding_arg_for_unibyte];
10175 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10177 val = args[coding_arg_plist];
10178 CHECK_LIST (val);
10179 ASET (attrs, coding_attr_plist, val);
/* Type-specific processing: each branch validates the extra
   arguments of its coding type, fills the type-specific attributes,
   and chooses CATEGORY.  */
10181 if (EQ (coding_type, Qcharset))
10183 /* Generate a lisp vector of 256 elements. Each element is nil,
10184 integer, or a list of charset IDs.
10186 If Nth element is nil, the byte code N is invalid in this
10187 coding system.
10189 If Nth element is a number NUM, N is the first byte of a
10190 charset whose ID is NUM.
10192 If Nth element is a list of charset IDs, N is the first byte
10193 of one of them. The list is sorted by dimensions of the
10194 charsets. A charset of smaller dimension comes first. */
10195 val = Fmake_vector (make_number (256), Qnil);
10197 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10199 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10200 int dim = CHARSET_DIMENSION (charset);
10201 int idx = (dim - 1) * 4;
10203 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10204 ASET (attrs, coding_attr_ascii_compat, Qt);
10206 for (i = charset->code_space[idx];
10207 i <= charset->code_space[idx + 1]; i++)
10209 Lisp_Object tmp, tmp2;
10210 int dim2;
10212 tmp = AREF (val, i);
10213 if (NILP (tmp))
10214 tmp = XCAR (tail);
10215 else if (NUMBERP (tmp))
10217 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10218 if (dim < dim2)
10219 tmp = list2 (XCAR (tail), tmp);
10220 else
10221 tmp = list2 (tmp, XCAR (tail));
10223 else
10225 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10227 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10228 if (dim < dim2)
10229 break;
10231 if (NILP (tmp2))
10232 tmp = nconc2 (tmp, list1 (XCAR (tail)));
10233 else
10235 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10236 XSETCAR (tmp2, XCAR (tail));
10239 ASET (val, i, tmp);
10242 ASET (attrs, coding_attr_charset_valids, val);
10243 category = coding_category_charset;
10245 else if (EQ (coding_type, Qccl))
10247 Lisp_Object valids;
10249 if (nargs < coding_arg_ccl_max)
10250 goto short_args;
10252 val = args[coding_arg_ccl_decoder];
10253 CHECK_CCL_PROGRAM (val);
10254 if (VECTORP (val))
10255 val = Fcopy_sequence (val);
10256 ASET (attrs, coding_attr_ccl_decoder, val);
10258 val = args[coding_arg_ccl_encoder];
10259 CHECK_CCL_PROGRAM (val);
10260 if (VECTORP (val))
10261 val = Fcopy_sequence (val);
10262 ASET (attrs, coding_attr_ccl_encoder, val);
/* VALIDS is a 256-byte string; byte N is set to 1 for every code N
   named by the argument, either as a single integer or as a
   (FROM . TO) range within 0..255.  */
10264 val = args[coding_arg_ccl_valids];
10265 valids = Fmake_string (make_number (256), make_number (0));
10266 for (tail = val; CONSP (tail); tail = XCDR (tail))
10268 int from, to;
10270 val = XCAR (tail);
10271 if (INTEGERP (val))
10273 if (! (0 <= XINT (val) && XINT (val) <= 255))
10274 args_out_of_range_3 (val, make_number (0), make_number (255));
10275 from = to = XINT (val);
10277 else
10279 CHECK_CONS (val);
10280 CHECK_NATNUM_CAR (val);
10281 CHECK_NUMBER_CDR (val);
10282 if (XINT (XCAR (val)) > 255)
10283 args_out_of_range_3 (XCAR (val),
10284 make_number (0), make_number (255));
10285 from = XINT (XCAR (val));
10286 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10287 args_out_of_range_3 (XCDR (val),
10288 XCAR (val), make_number (255));
10289 to = XINT (XCDR (val));
10291 for (i = from; i <= to; i++)
10292 SSET (valids, i, 1);
10294 ASET (attrs, coding_attr_ccl_valids, valids);
10296 category = coding_category_ccl;
10298 else if (EQ (coding_type, Qutf_16))
10300 Lisp_Object bom, endian;
10302 ASET (attrs, coding_attr_ascii_compat, Qnil);
10304 if (nargs < coding_arg_utf16_max)
10305 goto short_args;
/* BOM is nil, t, or a cons of two coding systems.  */
10307 bom = args[coding_arg_utf16_bom];
10308 if (! NILP (bom) && ! EQ (bom, Qt))
10310 CHECK_CONS (bom);
10311 val = XCAR (bom);
10312 CHECK_CODING_SYSTEM (val);
10313 val = XCDR (bom);
10314 CHECK_CODING_SYSTEM (val);
10316 ASET (attrs, coding_attr_utf_bom, bom);
/* A nil endian defaults to `big'.  */
10318 endian = args[coding_arg_utf16_endian];
10319 CHECK_SYMBOL (endian);
10320 if (NILP (endian))
10321 endian = Qbig;
10322 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10323 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10324 ASET (attrs, coding_attr_utf_16_endian, endian);
10326 category = (CONSP (bom)
10327 ? coding_category_utf_16_auto
10328 : NILP (bom)
10329 ? (EQ (endian, Qbig)
10330 ? coding_category_utf_16_be_nosig
10331 : coding_category_utf_16_le_nosig)
10332 : (EQ (endian, Qbig)
10333 ? coding_category_utf_16_be
10334 : coding_category_utf_16_le));
10336 else if (EQ (coding_type, Qiso_2022))
10338 Lisp_Object initial, reg_usage, request, flags;
10340 if (nargs < coding_arg_iso2022_max)
10341 goto short_args;
/* In the copied INITIAL vector, each of the first four elements is
   replaced by a charset ID, or by -1 when it was nil.  */
10343 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10344 CHECK_VECTOR (initial);
10345 for (i = 0; i < 4; i++)
10347 val = AREF (initial, i);
10348 if (! NILP (val))
10350 struct charset *charset;
10352 CHECK_CHARSET_GET_CHARSET (val, charset);
10353 ASET (initial, i, make_number (CHARSET_ID (charset)));
10354 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10355 ASET (attrs, coding_attr_ascii_compat, Qt);
10357 else
10358 ASET (initial, i, make_number (-1));
10361 reg_usage = args[coding_arg_iso2022_reg_usage];
10362 CHECK_CONS (reg_usage);
10363 CHECK_NUMBER_CAR (reg_usage);
10364 CHECK_NUMBER_CDR (reg_usage);
/* Each element of REQUEST is (CHARSET . REGISTER) with REGISTER in
   0..3; the charset is replaced by its ID in place.  */
10366 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10367 for (tail = request; CONSP (tail); tail = XCDR (tail))
10369 int id;
10370 Lisp_Object tmp1;
10372 val = XCAR (tail);
10373 CHECK_CONS (val);
10374 tmp1 = XCAR (val);
10375 CHECK_CHARSET_GET_ID (tmp1, id);
10376 CHECK_NATNUM_CDR (val);
10377 if (XINT (XCDR (val)) >= 4)
10378 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10379 XSETCAR (val, make_number (id));
10382 flags = args[coding_arg_iso2022_flags];
10383 CHECK_NATNUM (flags);
10384 i = XINT (flags) & INT_MAX;
10385 if (EQ (args[coding_arg_charset_list], Qiso_2022))
10386 i |= CODING_ISO_FLAG_FULL_SUPPORT;
10387 flags = make_number (i);
10389 ASET (attrs, coding_attr_iso_initial, initial);
10390 ASET (attrs, coding_attr_iso_usage, reg_usage);
10391 ASET (attrs, coding_attr_iso_request, request);
10392 ASET (attrs, coding_attr_iso_flags, flags);
10393 setup_iso_safe_charsets (attrs);
10395 if (i & CODING_ISO_FLAG_SEVEN_BITS)
10396 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10397 | CODING_ISO_FLAG_SINGLE_SHIFT))
10398 ? coding_category_iso_7_else
10399 : EQ (args[coding_arg_charset_list], Qiso_2022)
10400 ? coding_category_iso_7
10401 : coding_category_iso_7_tight);
10402 else
10404 int id = XINT (AREF (initial, 1));
10406 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10407 || EQ (args[coding_arg_charset_list], Qiso_2022)
10408 || id < 0)
10409 ? coding_category_iso_8_else
10410 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10411 ? coding_category_iso_8_1
10412 : coding_category_iso_8_2);
/* Only the iso-8-1 and iso-8-2 categories keep the ASCII-compatible
   flag.  */
10414 if (category != coding_category_iso_8_1
10415 && category != coding_category_iso_8_2)
10416 ASET (attrs, coding_attr_ascii_compat, Qnil);
10418 else if (EQ (coding_type, Qemacs_mule))
10420 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10421 ASET (attrs, coding_attr_emacs_mule_full, Qt);
10422 ASET (attrs, coding_attr_ascii_compat, Qt);
10423 category = coding_category_emacs_mule;
10425 else if (EQ (coding_type, Qshift_jis))
10428 struct charset *charset;
/* The charset list must hold three or four charsets, of dimensions
   1, 1, 2 and (optionally) 2, in that order.  */
10430 if (XINT (Flength (charset_list)) != 3
10431 && XINT (Flength (charset_list)) != 4)
10432 error ("There should be three or four charsets");
10434 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10435 if (CHARSET_DIMENSION (charset) != 1)
10436 error ("Dimension of charset %s is not one",
10437 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10438 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10439 ASET (attrs, coding_attr_ascii_compat, Qt);
10441 charset_list = XCDR (charset_list);
10442 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10443 if (CHARSET_DIMENSION (charset) != 1)
10444 error ("Dimension of charset %s is not one",
10445 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10447 charset_list = XCDR (charset_list);
10448 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10449 if (CHARSET_DIMENSION (charset) != 2)
10450 error ("Dimension of charset %s is not two",
10451 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10453 charset_list = XCDR (charset_list);
10454 if (! NILP (charset_list))
10456 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10457 if (CHARSET_DIMENSION (charset) != 2)
10458 error ("Dimension of charset %s is not two",
10459 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10462 category = coding_category_sjis;
10463 Vsjis_coding_system = name;
10465 else if (EQ (coding_type, Qbig5))
10467 struct charset *charset;
/* The charset list must hold exactly two charsets, of dimensions 1
   and 2, in that order.  */
10469 if (XINT (Flength (charset_list)) != 2)
10470 error ("There should be just two charsets");
10472 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10473 if (CHARSET_DIMENSION (charset) != 1)
10474 error ("Dimension of charset %s is not one",
10475 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10476 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10477 ASET (attrs, coding_attr_ascii_compat, Qt);
10479 charset_list = XCDR (charset_list);
10480 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10481 if (CHARSET_DIMENSION (charset) != 2)
10482 error ("Dimension of charset %s is not two",
10483 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10485 category = coding_category_big5;
10486 Vbig5_coding_system = name;
10488 else if (EQ (coding_type, Qraw_text))
10490 category = coding_category_raw_text;
10491 ASET (attrs, coding_attr_ascii_compat, Qt);
10493 else if (EQ (coding_type, Qutf_8))
10495 Lisp_Object bom;
10497 if (nargs < coding_arg_utf8_max)
10498 goto short_args;
/* BOM is nil, t, or a cons of two coding systems.  */
10500 bom = args[coding_arg_utf8_bom];
10501 if (! NILP (bom) && ! EQ (bom, Qt))
10503 CHECK_CONS (bom);
10504 val = XCAR (bom);
10505 CHECK_CODING_SYSTEM (val);
10506 val = XCDR (bom);
10507 CHECK_CODING_SYSTEM (val);
10509 ASET (attrs, coding_attr_utf_bom, bom);
10510 if (NILP (bom))
10511 ASET (attrs, coding_attr_ascii_compat, Qt);
10513 category = (CONSP (bom) ? coding_category_utf_8_auto
10514 : NILP (bom) ? coding_category_utf_8_nosig
10515 : coding_category_utf_8_sig);
10517 else if (EQ (coding_type, Qundecided))
10519 if (nargs < coding_arg_undecided_max)
10520 goto short_args;
10521 ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10522 args[coding_arg_undecided_inhibit_null_byte_detection]);
10523 ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10524 args[coding_arg_undecided_inhibit_iso_escape_detection]);
10525 ASET (attrs, coding_attr_undecided_prefer_utf_8,
10526 args[coding_arg_undecided_prefer_utf_8]);
10527 category = coding_category_undecided;
10529 else
10530 error ("Invalid coding system type: %s",
10531 SDATA (SYMBOL_NAME (coding_type)));
/* Record the category in ATTRS, then prepend the :category and
   :ascii-compatible-p entries to the property list.  */
10533 ASET (attrs, coding_attr_category, make_number (category));
10534 ASET (attrs, coding_attr_plist,
10535 Fcons (QCcategory,
10536 Fcons (AREF (Vcoding_category_table, category),
10537 CODING_ATTR_PLIST (attrs))));
10538 ASET (attrs, coding_attr_plist,
10539 Fcons (QCascii_compatible_p,
10540 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10541 CODING_ATTR_PLIST (attrs))));
10543 eol_type = args[coding_arg_eol_type];
10544 if (! NILP (eol_type)
10545 && ! EQ (eol_type, Qunix)
10546 && ! EQ (eol_type, Qdos)
10547 && ! EQ (eol_type, Qmac))
10548 error ("Invalid eol-type");
10550 aliases = list1 (name);
/* With a nil eol-type, register the three subsidiaries made by
   make_subsidiaries.  They share ATTRS with the base coding system,
   and the vector of their names becomes the base's eol-type.  */
10552 if (NILP (eol_type))
10554 eol_type = make_subsidiaries (name);
10555 for (i = 0; i < 3; i++)
10557 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10559 this_name = AREF (eol_type, i);
10560 this_aliases = list1 (this_name);
10561 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10562 this_spec = make_uninit_vector (3);
10563 ASET (this_spec, 0, attrs);
10564 ASET (this_spec, 1, this_aliases);
10565 ASET (this_spec, 2, this_eol_type);
10566 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10567 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10568 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10569 if (NILP (val))
10570 Vcoding_system_alist
10571 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10572 Vcoding_system_alist);
10576 spec_vec = make_uninit_vector (3);
10577 ASET (spec_vec, 0, attrs);
10578 ASET (spec_vec, 1, aliases);
10579 ASET (spec_vec, 2, eol_type);
10581 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10582 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10583 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10584 if (NILP (val))
10585 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10586 Vcoding_system_alist);
/* Set up the category's entry from NAME when the category has no
   coding system yet (negative id) or its current one has this same
   name.  */
10589 int id = coding_categories[category].id;
10591 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10592 setup_coding_system (name, &coding_categories[category]);
10595 return Qnil;
/* Reached whenever NARGS is smaller than the count required for the
   coding type.  */
10597 short_args:
10598 return Fsignal (Qwrong_number_of_arguments,
10599 Fcons (intern ("define-coding-system-internal"),
10600 make_number (nargs)));
10604 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10605 3, 3, 0,
10606 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10607 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10609 Lisp_Object spec, attrs;
10611 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10612 attrs = AREF (spec, 0);
10613 if (EQ (prop, QCmnemonic))
10615 if (! STRINGP (val))
10616 CHECK_CHARACTER (val);
10617 ASET (attrs, coding_attr_mnemonic, val);
10619 else if (EQ (prop, QCdefault_char))
10621 if (NILP (val))
10622 val = make_number (' ');
10623 else
10624 CHECK_CHARACTER (val);
10625 ASET (attrs, coding_attr_default_char, val);
10627 else if (EQ (prop, QCdecode_translation_table))
10629 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10630 CHECK_SYMBOL (val);
10631 ASET (attrs, coding_attr_decode_tbl, val);
10633 else if (EQ (prop, QCencode_translation_table))
10635 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10636 CHECK_SYMBOL (val);
10637 ASET (attrs, coding_attr_encode_tbl, val);
10639 else if (EQ (prop, QCpost_read_conversion))
10641 CHECK_SYMBOL (val);
10642 ASET (attrs, coding_attr_post_read, val);
10644 else if (EQ (prop, QCpre_write_conversion))
10646 CHECK_SYMBOL (val);
10647 ASET (attrs, coding_attr_pre_write, val);
10649 else if (EQ (prop, QCascii_compatible_p))
10651 ASET (attrs, coding_attr_ascii_compat, val);
10654 ASET (attrs, coding_attr_plist,
10655 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10656 return val;
10660 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10661 Sdefine_coding_system_alias, 2, 2, 0,
10662 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10663 (Lisp_Object alias, Lisp_Object coding_system)
10665 Lisp_Object spec, aliases, eol_type, val;
10667 CHECK_SYMBOL (alias);
10668 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10669 aliases = AREF (spec, 1);
10670 /* ALIASES should be a list of length more than zero, and the first
10671 element is a base coding system. Append ALIAS at the tail of the
10672 list. */
10673 while (!NILP (XCDR (aliases)))
10674 aliases = XCDR (aliases);
10675 XSETCDR (aliases, list1 (alias));
10677 eol_type = AREF (spec, 2);
10678 if (VECTORP (eol_type))
10680 Lisp_Object subsidiaries;
10681 int i;
10683 subsidiaries = make_subsidiaries (alias);
10684 for (i = 0; i < 3; i++)
10685 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10686 AREF (eol_type, i));
10689 Fputhash (alias, spec, Vcoding_system_hash_table);
10690 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10691 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10692 if (NILP (val))
10693 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10694 Vcoding_system_alist);
10696 return Qnil;
10699 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10700 1, 1, 0,
10701 doc: /* Return the base of CODING-SYSTEM.
10702 Any alias or subsidiary coding system is not a base coding system. */)
10703 (Lisp_Object coding_system)
10705 Lisp_Object spec, attrs;
10707 if (NILP (coding_system))
10708 return (Qno_conversion);
10709 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10710 attrs = AREF (spec, 0);
10711 return CODING_ATTR_BASE_NAME (attrs);
10714 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10715 1, 1, 0,
10716 doc: "Return the property list of CODING-SYSTEM.")
10717 (Lisp_Object coding_system)
10719 Lisp_Object spec, attrs;
10721 if (NILP (coding_system))
10722 coding_system = Qno_conversion;
10723 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10724 attrs = AREF (spec, 0);
10725 return CODING_ATTR_PLIST (attrs);
10729 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10730 1, 1, 0,
10731 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10732 (Lisp_Object coding_system)
10734 Lisp_Object spec;
10736 if (NILP (coding_system))
10737 coding_system = Qno_conversion;
10738 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10739 return AREF (spec, 1);
10742 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10743 Scoding_system_eol_type, 1, 1, 0,
10744 doc: /* Return eol-type of CODING-SYSTEM.
10745 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10747 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10748 and CR respectively.
10750 A vector value indicates that a format of end-of-line should be
10751 detected automatically. Nth element of the vector is the subsidiary
10752 coding system whose eol-type is N. */)
10753 (Lisp_Object coding_system)
10755 Lisp_Object spec, eol_type;
10756 int n;
10758 if (NILP (coding_system))
10759 coding_system = Qno_conversion;
10760 if (! CODING_SYSTEM_P (coding_system))
10761 return Qnil;
10762 spec = CODING_SYSTEM_SPEC (coding_system);
10763 eol_type = AREF (spec, 2);
10764 if (VECTORP (eol_type))
10765 return Fcopy_sequence (eol_type);
10766 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10767 return make_number (n);
10770 #endif /* emacs */
10773 /*** 12. Postamble ***/
10775 void
10776 init_coding_once (void)
10778 int i;
10780 for (i = 0; i < coding_category_max; i++)
10782 coding_categories[i].id = -1;
10783 coding_priorities[i] = i;
10786 /* ISO2022 specific initialize routine. */
10787 for (i = 0; i < 0x20; i++)
10788 iso_code_class[i] = ISO_control_0;
10789 for (i = 0x21; i < 0x7F; i++)
10790 iso_code_class[i] = ISO_graphic_plane_0;
10791 for (i = 0x80; i < 0xA0; i++)
10792 iso_code_class[i] = ISO_control_1;
10793 for (i = 0xA1; i < 0xFF; i++)
10794 iso_code_class[i] = ISO_graphic_plane_1;
10795 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10796 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10797 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10798 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10799 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10800 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10801 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10802 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10803 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10805 for (i = 0; i < 256; i++)
10807 emacs_mule_bytes[i] = 1;
10809 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10810 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10811 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10812 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10815 #ifdef emacs
10817 void
10818 syms_of_coding (void)
10820 staticpro (&Vcoding_system_hash_table);
10822 Lisp_Object args[2];
10823 args[0] = QCtest;
10824 args[1] = Qeq;
10825 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10828 staticpro (&Vsjis_coding_system);
10829 Vsjis_coding_system = Qnil;
10831 staticpro (&Vbig5_coding_system);
10832 Vbig5_coding_system = Qnil;
10834 staticpro (&Vcode_conversion_reused_workbuf);
10835 Vcode_conversion_reused_workbuf = Qnil;
10837 staticpro (&Vcode_conversion_workbuf_name);
10838 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10840 reused_workbuf_in_use = 0;
10842 DEFSYM (Qcharset, "charset");
10843 DEFSYM (Qtarget_idx, "target-idx");
10844 DEFSYM (Qcoding_system_history, "coding-system-history");
10845 Fset (Qcoding_system_history, Qnil);
10847 /* Target FILENAME is the first argument. */
10848 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10849 /* Target FILENAME is the third argument. */
10850 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10852 DEFSYM (Qcall_process, "call-process");
10853 /* Target PROGRAM is the first argument. */
10854 Fput (Qcall_process, Qtarget_idx, make_number (0));
10856 DEFSYM (Qcall_process_region, "call-process-region");
10857 /* Target PROGRAM is the third argument. */
10858 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10860 DEFSYM (Qstart_process, "start-process");
10861 /* Target PROGRAM is the third argument. */
10862 Fput (Qstart_process, Qtarget_idx, make_number (2));
10864 DEFSYM (Qopen_network_stream, "open-network-stream");
10865 /* Target SERVICE is the fourth argument. */
10866 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10868 DEFSYM (Qcoding_system, "coding-system");
10869 DEFSYM (Qcoding_aliases, "coding-aliases");
10871 DEFSYM (Qeol_type, "eol-type");
10872 DEFSYM (Qunix, "unix");
10873 DEFSYM (Qdos, "dos");
10874 DEFSYM (Qmac, "mac");
10876 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10877 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10878 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10879 DEFSYM (Qdefault_char, "default-char");
10880 DEFSYM (Qundecided, "undecided");
10881 DEFSYM (Qno_conversion, "no-conversion");
10882 DEFSYM (Qraw_text, "raw-text");
10884 DEFSYM (Qiso_2022, "iso-2022");
10886 DEFSYM (Qutf_8, "utf-8");
10887 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10889 #if defined (WINDOWSNT) || defined (CYGWIN)
10890 /* No, not utf-16-le: that one has a BOM. */
10891 DEFSYM (Qutf_16le, "utf-16le");
10892 #endif
10894 DEFSYM (Qutf_16, "utf-16");
10895 DEFSYM (Qbig, "big");
10896 DEFSYM (Qlittle, "little");
10898 DEFSYM (Qshift_jis, "shift-jis");
10899 DEFSYM (Qbig5, "big5");
10901 DEFSYM (Qcoding_system_p, "coding-system-p");
10903 DEFSYM (Qcoding_system_error, "coding-system-error");
10904 Fput (Qcoding_system_error, Qerror_conditions,
10905 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10906 Fput (Qcoding_system_error, Qerror_message,
10907 build_pure_c_string ("Invalid coding system"));
10909 DEFSYM (Qtranslation_table, "translation-table");
10910 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10911 DEFSYM (Qtranslation_table_id, "translation-table-id");
10912 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10913 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10915 DEFSYM (Qvalid_codes, "valid-codes");
10917 DEFSYM (Qemacs_mule, "emacs-mule");
10919 DEFSYM (QCcategory, ":category");
10920 DEFSYM (QCmnemonic, ":mnemonic");
10921 DEFSYM (QCdefault_char, ":default-char");
10922 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10923 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10924 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10925 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10926 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10928 Vcoding_category_table
10929 = Fmake_vector (make_number (coding_category_max), Qnil);
10930 staticpro (&Vcoding_category_table);
10931 /* Followings are target of code detection. */
10932 ASET (Vcoding_category_table, coding_category_iso_7,
10933 intern_c_string ("coding-category-iso-7"));
10934 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10935 intern_c_string ("coding-category-iso-7-tight"));
10936 ASET (Vcoding_category_table, coding_category_iso_8_1,
10937 intern_c_string ("coding-category-iso-8-1"));
10938 ASET (Vcoding_category_table, coding_category_iso_8_2,
10939 intern_c_string ("coding-category-iso-8-2"));
10940 ASET (Vcoding_category_table, coding_category_iso_7_else,
10941 intern_c_string ("coding-category-iso-7-else"));
10942 ASET (Vcoding_category_table, coding_category_iso_8_else,
10943 intern_c_string ("coding-category-iso-8-else"));
10944 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10945 intern_c_string ("coding-category-utf-8-auto"));
10946 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10947 intern_c_string ("coding-category-utf-8"));
10948 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10949 intern_c_string ("coding-category-utf-8-sig"));
10950 ASET (Vcoding_category_table, coding_category_utf_16_be,
10951 intern_c_string ("coding-category-utf-16-be"));
10952 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10953 intern_c_string ("coding-category-utf-16-auto"));
10954 ASET (Vcoding_category_table, coding_category_utf_16_le,
10955 intern_c_string ("coding-category-utf-16-le"));
10956 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10957 intern_c_string ("coding-category-utf-16-be-nosig"));
10958 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10959 intern_c_string ("coding-category-utf-16-le-nosig"));
10960 ASET (Vcoding_category_table, coding_category_charset,
10961 intern_c_string ("coding-category-charset"));
10962 ASET (Vcoding_category_table, coding_category_sjis,
10963 intern_c_string ("coding-category-sjis"));
10964 ASET (Vcoding_category_table, coding_category_big5,
10965 intern_c_string ("coding-category-big5"));
10966 ASET (Vcoding_category_table, coding_category_ccl,
10967 intern_c_string ("coding-category-ccl"));
10968 ASET (Vcoding_category_table, coding_category_emacs_mule,
10969 intern_c_string ("coding-category-emacs-mule"));
10970 /* The following are NOT targets of code detection. */
10971 ASET (Vcoding_category_table, coding_category_raw_text,
10972 intern_c_string ("coding-category-raw-text"));
10973 ASET (Vcoding_category_table, coding_category_undecided,
10974 intern_c_string ("coding-category-undecided"));
10976 DEFSYM (Qinsufficient_source, "insufficient-source");
10977 DEFSYM (Qinvalid_source, "invalid-source");
10978 DEFSYM (Qinterrupted, "interrupted");
10979 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10981 defsubr (&Scoding_system_p);
10982 defsubr (&Sread_coding_system);
10983 defsubr (&Sread_non_nil_coding_system);
10984 defsubr (&Scheck_coding_system);
10985 defsubr (&Sdetect_coding_region);
10986 defsubr (&Sdetect_coding_string);
10987 defsubr (&Sfind_coding_systems_region_internal);
10988 defsubr (&Sunencodable_char_position);
10989 defsubr (&Scheck_coding_systems_region);
10990 defsubr (&Sdecode_coding_region);
10991 defsubr (&Sencode_coding_region);
10992 defsubr (&Sdecode_coding_string);
10993 defsubr (&Sencode_coding_string);
10994 defsubr (&Sdecode_sjis_char);
10995 defsubr (&Sencode_sjis_char);
10996 defsubr (&Sdecode_big5_char);
10997 defsubr (&Sencode_big5_char);
10998 defsubr (&Sset_terminal_coding_system_internal);
10999 defsubr (&Sset_safe_terminal_coding_system_internal);
11000 defsubr (&Sterminal_coding_system);
11001 defsubr (&Sset_keyboard_coding_system_internal);
11002 defsubr (&Skeyboard_coding_system);
11003 defsubr (&Sfind_operation_coding_system);
11004 defsubr (&Sset_coding_system_priority);
11005 defsubr (&Sdefine_coding_system_internal);
11006 defsubr (&Sdefine_coding_system_alias);
11007 defsubr (&Scoding_system_put);
11008 defsubr (&Scoding_system_base);
11009 defsubr (&Scoding_system_plist);
11010 defsubr (&Scoding_system_aliases);
11011 defsubr (&Scoding_system_eol_type);
11012 defsubr (&Scoding_system_priority_list);
11014 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
11015 doc: /* List of coding systems.
11017 Do not alter the value of this variable manually. This variable should be
11018 updated by the functions `define-coding-system' and
11019 `define-coding-system-alias'. */);
11020 Vcoding_system_list = Qnil;
11022 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11023 doc: /* Alist of coding system names.
11024 Each element is a one-element list containing a coding system name.
11025 This variable is given to `completing-read' as COLLECTION argument.
11027 Do not alter the value of this variable manually. This variable should be
11028 updated by the functions `make-coding-system' and
11029 `define-coding-system-alias'. */);
11030 Vcoding_system_alist = Qnil;
11032 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11033 doc: /* List of coding-categories (symbols) ordered by priority.
11035 On detecting a coding system, Emacs tries code detection algorithms
11036 associated with each coding-category one by one in this order. When
11037 one algorithm agrees with a byte sequence of source text, the coding
11038 system bound to the corresponding coding-category is selected.
11040 Don't modify this variable directly, but use `set-coding-system-priority'. */);
11042 int i;
11044 Vcoding_category_list = Qnil;
11045 for (i = coding_category_max - 1; i >= 0; i--)
11046 Vcoding_category_list
11047 = Fcons (AREF (Vcoding_category_table, i),
11048 Vcoding_category_list);
11051 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11052 doc: /* Specify the coding system for read operations.
11053 It is useful to bind this variable with `let', but do not set it globally.
11054 If the value is a coding system, it is used for decoding on read operation.
11055 If not, an appropriate element is used from one of the coding system alists.
11056 There are three such tables: `file-coding-system-alist',
11057 `process-coding-system-alist', and `network-coding-system-alist'. */);
11058 Vcoding_system_for_read = Qnil;
11060 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11061 doc: /* Specify the coding system for write operations.
11062 Programs bind this variable with `let', but you should not set it globally.
11063 If the value is a coding system, it is used for encoding of output,
11064 when writing it to a file and when sending it to a file or subprocess.
11066 If this does not specify a coding system, an appropriate element
11067 is used from one of the coding system alists.
11068 There are three such tables: `file-coding-system-alist',
11069 `process-coding-system-alist', and `network-coding-system-alist'.
11070 For output to files, if the above procedure does not specify a coding system,
11071 the value of `buffer-file-coding-system' is used. */);
11072 Vcoding_system_for_write = Qnil;
11074 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11075 doc: /*
11076 Coding system used in the latest file or process I/O. */);
11077 Vlast_coding_system_used = Qnil;
11079 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11080 doc: /*
11081 Error status of the last code conversion.
11083 When an error was detected in the last code conversion, this variable
11084 is set to one of the following symbols.
11085 `insufficient-source'
11086 `inconsistent-eol'
11087 `invalid-source'
11088 `interrupted'
11089 `insufficient-memory'
11090 When no error was detected, the value doesn't change. So, to check
11091 the error status of a code conversion by this variable, you must
11092 explicitly set this variable to nil before performing code
11093 conversion. */);
11094 Vlast_code_conversion_error = Qnil;
11096 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11097 doc: /*
11098 *Non-nil means always inhibit code conversion of end-of-line format.
11099 See info node `Coding Systems' and info node `Text and Binary' concerning
11100 such conversion. */);
11101 inhibit_eol_conversion = 0;
11103 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11104 doc: /*
11105 Non-nil means process buffer inherits coding system of process output.
11106 Bind it to t if the process output is to be treated as if it were a file
11107 read from some filesystem. */);
11108 inherit_process_coding_system = 0;
11110 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11111 doc: /*
11112 Alist to decide a coding system to use for a file I/O operation.
11113 The format is ((PATTERN . VAL) ...),
11114 where PATTERN is a regular expression matching a file name,
11115 VAL is a coding system, a cons of coding systems, or a function symbol.
11116 If VAL is a coding system, it is used for both decoding and encoding
11117 the file contents.
11118 If VAL is a cons of coding systems, the car part is used for decoding,
11119 and the cdr part is used for encoding.
11120 If VAL is a function symbol, the function must return a coding system
11121 or a cons of coding systems which are used as above. The function is
11122 called with an argument that is a list of the arguments with which
11123 `find-operation-coding-system' was called. If the function can't decide
11124 a coding system, it can return `undecided' so that the normal
11125 code-detection is performed.
11127 See also the function `find-operation-coding-system'
11128 and the variable `auto-coding-alist'. */);
11129 Vfile_coding_system_alist = Qnil;
11131 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11132 doc: /*
11133 Alist to decide a coding system to use for a process I/O operation.
11134 The format is ((PATTERN . VAL) ...),
11135 where PATTERN is a regular expression matching a program name,
11136 VAL is a coding system, a cons of coding systems, or a function symbol.
11137 If VAL is a coding system, it is used for both decoding what is received
11138 from the program and encoding what is sent to the program.
11139 If VAL is a cons of coding systems, the car part is used for decoding,
11140 and the cdr part is used for encoding.
11141 If VAL is a function symbol, the function must return a coding system
11142 or a cons of coding systems which are used as above.
11144 See also the function `find-operation-coding-system'. */);
11145 Vprocess_coding_system_alist = Qnil;
11147 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11148 doc: /*
11149 Alist to decide a coding system to use for a network I/O operation.
11150 The format is ((PATTERN . VAL) ...),
11151 where PATTERN is a regular expression matching a network service name
11152 or is a port number to connect to,
11153 VAL is a coding system, a cons of coding systems, or a function symbol.
11154 If VAL is a coding system, it is used for both decoding what is received
11155 from the network stream and encoding what is sent to the network stream.
11156 If VAL is a cons of coding systems, the car part is used for decoding,
11157 and the cdr part is used for encoding.
11158 If VAL is a function symbol, the function must return a coding system
11159 or a cons of coding systems which are used as above.
11161 See also the function `find-operation-coding-system'. */);
11162 Vnetwork_coding_system_alist = Qnil;
11164 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11165 doc: /* Coding system to use with system messages.
11166 Also used for decoding keyboard input on X Window system. */);
11167 Vlocale_coding_system = Qnil;
11169 /* The eol mnemonics are reset in startup.el system-dependently. */
11170 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11171 doc: /*
11172 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
11173 eol_mnemonic_unix = build_pure_c_string (":");
11175 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11176 doc: /*
11177 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
11178 eol_mnemonic_dos = build_pure_c_string ("\\");
11180 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11181 doc: /*
11182 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
11183 eol_mnemonic_mac = build_pure_c_string ("/");
11185 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11186 doc: /*
11187 *String displayed in mode line when end-of-line format is not yet determined. */);
11188 eol_mnemonic_undecided = build_pure_c_string (":");
11190 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11191 doc: /*
11192 *Non-nil enables character translation while encoding and decoding. */);
11193 Venable_character_translation = Qt;
11195 DEFVAR_LISP ("standard-translation-table-for-decode",
11196 Vstandard_translation_table_for_decode,
11197 doc: /* Table for translating characters while decoding. */);
11198 Vstandard_translation_table_for_decode = Qnil;
11200 DEFVAR_LISP ("standard-translation-table-for-encode",
11201 Vstandard_translation_table_for_encode,
11202 doc: /* Table for translating characters while encoding. */);
11203 Vstandard_translation_table_for_encode = Qnil;
11205 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11206 doc: /* Alist of charsets vs revision numbers.
11207 While encoding, if a charset (car part of an element) is found,
11208 designate it with the escape sequence identifying revision (cdr part
11209 of the element). */);
11210 Vcharset_revision_table = Qnil;
11212 DEFVAR_LISP ("default-process-coding-system",
11213 Vdefault_process_coding_system,
11214 doc: /* Cons of coding systems used for process I/O by default.
11215 The car part is used for decoding a process output,
11216 the cdr part is used for encoding a text to be sent to a process. */);
11217 Vdefault_process_coding_system = Qnil;
11219 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11220 doc: /*
11221 Table of extra Latin codes in the range 128..159 (inclusive).
11222 This is a vector of length 256.
11223 If Nth element is non-nil, the existence of code N in a file
11224 \(or output of subprocess) doesn't prevent it from being detected as
11225 a coding system of ISO 2022 variant which has a flag
11226 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11227 or reading output of a subprocess.
11228 Only 128th through 159th elements have a meaning. */);
11229 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11231 DEFVAR_LISP ("select-safe-coding-system-function",
11232 Vselect_safe_coding_system_function,
11233 doc: /*
11234 Function to call to select safe coding system for encoding a text.
11236 If set, this function is called to force a user to select a proper
11237 coding system which can encode the text in the case that a default
11238 coding system used in each operation can't encode the text. The
11239 function should take care that the buffer is not modified while
11240 the coding system is being selected.
11242 The default value is `select-safe-coding-system' (which see). */);
11243 Vselect_safe_coding_system_function = Qnil;
11245 DEFVAR_BOOL ("coding-system-require-warning",
11246 coding_system_require_warning,
11247 doc: /* Internal use only.
11248 If non-nil, on writing a file, `select-safe-coding-system-function' is
11249 called even if `coding-system-for-write' is non-nil. The command
11250 `universal-coding-system-argument' binds this variable to t temporarily. */);
11251 coding_system_require_warning = 0;
11254 DEFVAR_BOOL ("inhibit-iso-escape-detection",
11255 inhibit_iso_escape_detection,
11256 doc: /*
11257 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11259 When Emacs reads text, it tries to detect how the text is encoded.
11260 This code detection is sensitive to escape sequences. If Emacs sees
11261 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11262 of the ISO2022 encodings, and decodes text by the corresponding coding
11263 system (e.g. `iso-2022-7bit').
11265 However, there may be a case that you want to read escape sequences in
11266 a file as is. In such a case, you can set this variable to non-nil.
11267 Then the code detection will ignore any escape sequences, and no text is
11268 detected as encoded in some ISO-2022 encoding. The result is that all
11269 escape sequences become visible in a buffer.
11271 The default value is nil, and it is strongly recommended not to change
11272 it. That is because many Emacs Lisp source files that contain
11273 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11274 in Emacs's distribution, and they won't be decoded correctly on
11275 reading if you suppress escape sequence detection.
11277 The other way to read escape sequences in a file without decoding is
11278 to explicitly specify some coding system that doesn't use ISO-2022
11279 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument]. */);
11280 inhibit_iso_escape_detection = 0;
11282 DEFVAR_BOOL ("inhibit-null-byte-detection",
11283 inhibit_null_byte_detection,
11284 doc: /* If non-nil, Emacs ignores null bytes on code detection.
11285 By default, Emacs treats it as binary data, and does not attempt to
11286 decode it. The effect is as if you specified `no-conversion' for
11287 reading that text.
11289 Set this to non-nil when a regular text happens to include null bytes.
11290 Examples are Index nodes of Info files and null-byte delimited output
11291 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
11292 decode text as usual. */);
11293 inhibit_null_byte_detection = 0;
11295 DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11296 doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11297 Internal use only. Remove after the experimental optimizer becomes stable. */);
11298 disable_ascii_optimization = 0;
11300 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11301 doc: /* Char table for translating self-inserting characters.
11302 This is applied to the result of input methods, not their input.
11303 See also `keyboard-translate-table'.
11305 Use of this variable for character code unification was rendered
11306 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11307 internal character representation. */);
11308 Vtranslation_table_for_input = Qnil;
11311 Lisp_Object args[coding_arg_undecided_max];
11312 Lisp_Object plist[16];
11313 int i;
11315 for (i = 0; i < coding_arg_undecided_max; i++)
11316 args[i] = Qnil;
11318 plist[0] = intern_c_string (":name");
11319 plist[1] = args[coding_arg_name] = Qno_conversion;
11320 plist[2] = intern_c_string (":mnemonic");
11321 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11322 plist[4] = intern_c_string (":coding-type");
11323 plist[5] = args[coding_arg_coding_type] = Qraw_text;
11324 plist[6] = intern_c_string (":ascii-compatible-p");
11325 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11326 plist[8] = intern_c_string (":default-char");
11327 plist[9] = args[coding_arg_default_char] = make_number (0);
11328 plist[10] = intern_c_string (":for-unibyte");
11329 plist[11] = args[coding_arg_for_unibyte] = Qt;
11330 plist[12] = intern_c_string (":docstring");
11331 plist[13] = build_pure_c_string ("Do no conversion.\n\
11333 When you visit a file with this coding, the file is read into a\n\
11334 unibyte buffer as is, thus each byte of a file is treated as a\n\
11335 character.");
11336 plist[14] = intern_c_string (":eol-type");
11337 plist[15] = args[coding_arg_eol_type] = Qunix;
11338 args[coding_arg_plist] = Flist (16, plist);
11339 Fdefine_coding_system_internal (coding_arg_max, args);
11341 plist[1] = args[coding_arg_name] = Qundecided;
11342 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11343 plist[5] = args[coding_arg_coding_type] = Qundecided;
11344 /* This is already set.
11345 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11346 plist[8] = intern_c_string (":charset-list");
11347 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11348 plist[11] = args[coding_arg_for_unibyte] = Qnil;
11349 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11350 plist[15] = args[coding_arg_eol_type] = Qnil;
11351 args[coding_arg_plist] = Flist (16, plist);
11352 args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11353 args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11354 Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11357 setup_coding_system (Qno_conversion, &safe_terminal_coding);
11360 int i;
11362 for (i = 0; i < coding_category_max; i++)
11363 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11365 #if defined (DOS_NT)
11366 system_eol_type = Qdos;
11367 #else
11368 system_eol_type = Qunix;
11369 #endif
11370 staticpro (&system_eol_type);
/* Return the text that strerror gives for ERROR_NUMBER.  When
   Vlocale_coding_system is non-nil, that text is first passed through
   code_convert_string_norecord with Vlocale_coding_system, and the
   data of the resulting Lisp string is returned instead.  */
11373 char *
11374 emacs_strerror (int error_number)
11376 char *str;
/* Called before strerror; presumably selects the locale used for
   system messages -- confirm against its definition.  */
11378 synchronize_system_messages_locale ();
11379 str = strerror (error_number);
11381 if (! NILP (Vlocale_coding_system))
11383 Lisp_Object dec = code_convert_string_norecord (build_string (str),
11384 Vlocale_coding_system,
/* NOTE(review): STR now points into the data of DEC, and no other
   reference to DEC is visible here -- confirm that callers use the
   result before that Lisp string can be reclaimed.  */
11386 str = SSDATA (dec);
11389 return str;
11392 #endif /* emacs */