Fix `eww-current-source' buffer confustion
[emacs.git] / src / coding.c
blobe4b52f6db484cd0db79666ec9f9413b01060d9af
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2014 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26 /*** TABLE OF CONTENTS ***
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
44 /*** 0. General comments ***
47 CODING SYSTEM
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
67 o UTF-8
69 o UTF-16
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
94 o BIG5
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
102 o CCL
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
109 o Raw-text
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
115 o No-conversion
117 Like raw text, but don't do end-of-line conversion.
120 END-OF-LINE FORMAT
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
132 STRUCT CODING_SYSTEM
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
141 /* COMMON MACROS */
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
150 Return true if the byte sequence conforms to XXX.
152 Below is the template of these functions. */
154 #if 0
155 static bool
156 detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
161 bool multibytep = coding->src_multibyte;
162 ptrdiff_t consumed_chars = 0;
163 int found = 0;
164 ...;
166 while (1)
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (! __C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX;
177 /* The byte sequence is invalid for XXX. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0;
181 no_more_source:
182 /* The source exhausted successfully. */
183 detect_info->found |= found;
184 return 1;
186 #endif
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
200 Below is the template of these functions. */
202 #if 0
203 static void
204 decode_coding_XXXX (struct coding_system *coding)
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base;
212 /* A buffer to produce decoded characters. */
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
215 bool multibytep = coding->src_multibyte;
217 while (1)
219 src_base = src;
220 if (charbuf < charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
240 #endif
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
258 Below is a template of these functions. */
259 #if 0
260 static void
261 encode_coding_XXX (struct coding_system *coding)
263 bool multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
269 ptrdiff_t produced_chars = 0;
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. */
276 label_no_more_destination:
277 /* How many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
281 #endif
284 /*** 1. Preamble ***/
286 #include <config.h>
287 #include <stdio.h>
289 #ifdef HAVE_WCHAR_H
290 #include <wchar.h>
291 #endif /* HAVE_WCHAR_H */
293 #include "lisp.h"
294 #include "character.h"
295 #include "buffer.h"
296 #include "charset.h"
297 #include "ccl.h"
298 #include "composite.h"
299 #include "coding.h"
300 #include "window.h"
301 #include "frame.h"
302 #include "termhooks.h"
304 Lisp_Object Vcoding_system_hash_table;
306 static Lisp_Object Qcoding_system, Qeol_type;
307 static Lisp_Object Qcoding_aliases;
308 Lisp_Object Qunix, Qdos;
309 static Lisp_Object Qmac;
310 Lisp_Object Qbuffer_file_coding_system;
311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
312 static Lisp_Object Qdefault_char;
313 Lisp_Object Qno_conversion, Qundecided;
314 Lisp_Object Qcharset, Qutf_8;
315 static Lisp_Object Qiso_2022;
316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
317 static Lisp_Object Qbig, Qlittle;
318 static Lisp_Object Qcoding_system_history;
319 static Lisp_Object Qvalid_codes;
320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
323 static Lisp_Object QCascii_compatible_p;
325 Lisp_Object Qcall_process, Qcall_process_region;
326 Lisp_Object Qstart_process, Qopen_network_stream;
327 static Lisp_Object Qtarget_idx;
329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
331 /* If a symbol has this property, evaluate the value to define the
332 symbol as a coding system. */
333 static Lisp_Object Qcoding_system_define_form;
335 /* Format of end-of-line decided by system. This is Qunix on
336 Unix and Mac, Qdos on DOS/Windows.
337 This has an effect only for external encoding (i.e. for output to
338 file and process), not for in-buffer or Lisp string encoding. */
339 static Lisp_Object system_eol_type;
341 #ifdef emacs
343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
345 /* Coding system emacs-mule and raw-text are for converting only
346 end-of-line format. */
347 Lisp_Object Qemacs_mule, Qraw_text;
348 Lisp_Object Qutf_8_emacs;
350 #if defined (WINDOWSNT) || defined (CYGWIN)
351 static Lisp_Object Qutf_16le;
352 #endif
354 /* Coding-systems are handed between Emacs Lisp programs and C internal
355 routines by the following three variables. */
356 /* Coding system to be used to encode text for terminal display when
357 terminal coding system is nil. */
358 struct coding_system safe_terminal_coding;
360 #endif /* emacs */
362 Lisp_Object Qtranslation_table;
363 Lisp_Object Qtranslation_table_id;
364 static Lisp_Object Qtranslation_table_for_decode;
365 static Lisp_Object Qtranslation_table_for_encode;
367 /* Two special coding systems. */
368 static Lisp_Object Vsjis_coding_system;
369 static Lisp_Object Vbig5_coding_system;
371 /* ISO2022 section */
373 #define CODING_ISO_INITIAL(coding, reg) \
374 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
375 coding_attr_iso_initial), \
376 reg)))
379 #define CODING_ISO_REQUEST(coding, charset_id) \
380 (((charset_id) <= (coding)->max_charset_id \
381 ? ((coding)->safe_charsets[charset_id] != 255 \
382 ? (coding)->safe_charsets[charset_id] \
383 : -1) \
384 : -1))
387 #define CODING_ISO_FLAGS(coding) \
388 ((coding)->spec.iso_2022.flags)
389 #define CODING_ISO_DESIGNATION(coding, reg) \
390 ((coding)->spec.iso_2022.current_designation[reg])
391 #define CODING_ISO_INVOCATION(coding, plane) \
392 ((coding)->spec.iso_2022.current_invocation[plane])
393 #define CODING_ISO_SINGLE_SHIFTING(coding) \
394 ((coding)->spec.iso_2022.single_shifting)
395 #define CODING_ISO_BOL(coding) \
396 ((coding)->spec.iso_2022.bol)
397 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
398 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
399 #define CODING_ISO_CMP_STATUS(coding) \
400 (&(coding)->spec.iso_2022.cmp_status)
401 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
402 ((coding)->spec.iso_2022.ctext_extended_segment_len)
403 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
404 ((coding)->spec.iso_2022.embedded_utf_8)
406 /* Control characters of ISO2022. */
407 /* code */ /* function */
408 #define ISO_CODE_SO 0x0E /* shift-out */
409 #define ISO_CODE_SI 0x0F /* shift-in */
410 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
411 #define ISO_CODE_ESC 0x1B /* escape */
412 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
413 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
414 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
416 /* All code (1-byte) of ISO2022 is classified into one of the
417 followings. */
418 enum iso_code_class_type
420 ISO_control_0, /* Control codes in the range
421 0x00..0x1F and 0x7F, except for the
422 following 5 codes. */
423 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
424 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
425 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
426 ISO_escape, /* ISO_CODE_ESC (0x1B) */
427 ISO_control_1, /* Control codes in the range
428 0x80..0x9F, except for the
429 following 3 codes. */
430 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
431 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
432 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
433 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
434 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
435 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
436 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
440 `iso-flags' attribute of an iso2022 coding system. */
442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
443 instead of the correct short-form sequence (e.g. ESC $ A). */
444 #define CODING_ISO_FLAG_LONG_FORM 0x0001
446 /* If set, reset graphic planes and registers at end-of-line to the
447 initial state. */
448 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
450 /* If set, reset graphic planes and registers before any control
451 characters to the initial state. */
452 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
454 /* If set, encode by 7-bit environment. */
455 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
457 /* If set, use locking-shift function. */
458 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
460 /* If set, use single-shift function. Overwrite
461 CODING_ISO_FLAG_LOCKING_SHIFT. */
462 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
464 /* If set, use designation escape sequence. */
465 #define CODING_ISO_FLAG_DESIGNATION 0x0040
467 /* If set, produce revision number sequence. */
468 #define CODING_ISO_FLAG_REVISION 0x0080
470 /* If set, produce ISO6429's direction specifying sequence. */
471 #define CODING_ISO_FLAG_DIRECTION 0x0100
473 /* If set, assume designation states are reset at beginning of line on
474 output. */
475 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
477 /* If set, designation sequence should be placed at beginning of line
478 on output. */
479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
481 /* If set, do not encode unsafe characters on output. */
482 #define CODING_ISO_FLAG_SAFE 0x0800
484 /* If set, extra latin codes (128..159) are accepted as a valid code
485 on input. */
486 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
488 #define CODING_ISO_FLAG_COMPOSITION 0x2000
490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
492 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
494 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
496 #define CODING_ISO_FLAG_LEVEL_4 0x20000
498 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
500 /* A character to be produced on output if encoding of the original
501 character is prohibited by CODING_ISO_FLAG_SAFE. */
502 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
504 /* UTF-8 section */
505 #define CODING_UTF_8_BOM(coding) \
506 ((coding)->spec.utf_8_bom)
508 /* UTF-16 section */
509 #define CODING_UTF_16_BOM(coding) \
510 ((coding)->spec.utf_16.bom)
512 #define CODING_UTF_16_ENDIAN(coding) \
513 ((coding)->spec.utf_16.endian)
515 #define CODING_UTF_16_SURROGATE(coding) \
516 ((coding)->spec.utf_16.surrogate)
519 /* CCL section */
520 #define CODING_CCL_DECODER(coding) \
521 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
522 #define CODING_CCL_ENCODER(coding) \
523 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
524 #define CODING_CCL_VALIDS(coding) \
525 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
527 /* Index for each coding category in `coding_categories' */
529 enum coding_category
531 coding_category_iso_7,
532 coding_category_iso_7_tight,
533 coding_category_iso_8_1,
534 coding_category_iso_8_2,
535 coding_category_iso_7_else,
536 coding_category_iso_8_else,
537 coding_category_utf_8_auto,
538 coding_category_utf_8_nosig,
539 coding_category_utf_8_sig,
540 coding_category_utf_16_auto,
541 coding_category_utf_16_be,
542 coding_category_utf_16_le,
543 coding_category_utf_16_be_nosig,
544 coding_category_utf_16_le_nosig,
545 coding_category_charset,
546 coding_category_sjis,
547 coding_category_big5,
548 coding_category_ccl,
549 coding_category_emacs_mule,
550 /* All above are targets of code detection. */
551 coding_category_raw_text,
552 coding_category_undecided,
553 coding_category_max
556 /* Definitions of flag bits used in detect_coding_XXXX. */
557 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
558 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
559 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
560 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
561 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
562 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
563 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
564 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
565 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
566 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
567 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
568 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
569 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
570 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
571 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
572 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
573 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
574 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
575 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
576 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
578 /* This value is returned if detect_coding_mask () find nothing other
579 than ASCII characters. */
580 #define CATEGORY_MASK_ANY \
581 (CATEGORY_MASK_ISO_7 \
582 | CATEGORY_MASK_ISO_7_TIGHT \
583 | CATEGORY_MASK_ISO_8_1 \
584 | CATEGORY_MASK_ISO_8_2 \
585 | CATEGORY_MASK_ISO_7_ELSE \
586 | CATEGORY_MASK_ISO_8_ELSE \
587 | CATEGORY_MASK_UTF_8_AUTO \
588 | CATEGORY_MASK_UTF_8_NOSIG \
589 | CATEGORY_MASK_UTF_8_SIG \
590 | CATEGORY_MASK_UTF_16_AUTO \
591 | CATEGORY_MASK_UTF_16_BE \
592 | CATEGORY_MASK_UTF_16_LE \
593 | CATEGORY_MASK_UTF_16_BE_NOSIG \
594 | CATEGORY_MASK_UTF_16_LE_NOSIG \
595 | CATEGORY_MASK_CHARSET \
596 | CATEGORY_MASK_SJIS \
597 | CATEGORY_MASK_BIG5 \
598 | CATEGORY_MASK_CCL \
599 | CATEGORY_MASK_EMACS_MULE)
602 #define CATEGORY_MASK_ISO_7BIT \
603 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
605 #define CATEGORY_MASK_ISO_8BIT \
606 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
608 #define CATEGORY_MASK_ISO_ELSE \
609 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
611 #define CATEGORY_MASK_ISO_ESCAPE \
612 (CATEGORY_MASK_ISO_7 \
613 | CATEGORY_MASK_ISO_7_TIGHT \
614 | CATEGORY_MASK_ISO_7_ELSE \
615 | CATEGORY_MASK_ISO_8_ELSE)
617 #define CATEGORY_MASK_ISO \
618 ( CATEGORY_MASK_ISO_7BIT \
619 | CATEGORY_MASK_ISO_8BIT \
620 | CATEGORY_MASK_ISO_ELSE)
622 #define CATEGORY_MASK_UTF_16 \
623 (CATEGORY_MASK_UTF_16_AUTO \
624 | CATEGORY_MASK_UTF_16_BE \
625 | CATEGORY_MASK_UTF_16_LE \
626 | CATEGORY_MASK_UTF_16_BE_NOSIG \
627 | CATEGORY_MASK_UTF_16_LE_NOSIG)
629 #define CATEGORY_MASK_UTF_8 \
630 (CATEGORY_MASK_UTF_8_AUTO \
631 | CATEGORY_MASK_UTF_8_NOSIG \
632 | CATEGORY_MASK_UTF_8_SIG)
634 /* Table of coding categories (Lisp symbols). This variable is for
635 internal use only. */
636 static Lisp_Object Vcoding_category_table;
638 /* Table of coding-categories ordered by priority. */
639 static enum coding_category coding_priorities[coding_category_max];
641 /* Nth element is a coding context for the coding system bound to the
642 Nth coding category. */
643 static struct coding_system coding_categories[coding_category_max];
645 /* Encode a flag that can be nil, something else, or t as -1, 0, 1. */
647 static int
648 encode_inhibit_flag (Lisp_Object flag)
650 return NILP (flag) ? -1 : EQ (flag, Qt);
653 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
654 1 means yes, -1 means no, 0 means ask the user variable VAR. */
656 static bool
657 inhibit_flag (int encoded_flag, bool var)
659 return 0 < encoded_flag + var;
662 #define CODING_GET_INFO(coding, attrs, charset_list) \
663 do { \
664 (attrs) = CODING_ID_ATTRS ((coding)->id); \
665 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
666 } while (0)
668 static void
669 CHECK_NATNUM_CAR (Lisp_Object x)
671 Lisp_Object tmp = XCAR (x);
672 CHECK_NATNUM (tmp);
673 XSETCAR (x, tmp);
676 static void
677 CHECK_NATNUM_CDR (Lisp_Object x)
679 Lisp_Object tmp = XCDR (x);
680 CHECK_NATNUM (tmp);
681 XSETCDR (x, tmp);
684 /* True if CODING's destination can be grown. */
686 static bool
687 growable_destination (struct coding_system *coding)
689 return STRINGP (coding->dst_object) || BUFFERP (coding->dst_object);
693 /* Safely get one byte from the source text pointed by SRC which ends
694 at SRC_END, and set C to that byte. If there are not enough bytes
695 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
696 and a multibyte character is found at SRC, set C to the
697 negative value of the character code. The caller should declare
698 and set these variables appropriately in advance:
699 src, src_end, multibytep */
701 #define ONE_MORE_BYTE(c) \
702 do { \
703 if (src == src_end) \
705 if (src_base < src) \
706 record_conversion_result \
707 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
708 goto no_more_source; \
710 c = *src++; \
711 if (multibytep && (c & 0x80)) \
713 if ((c & 0xFE) == 0xC0) \
714 c = ((c & 1) << 6) | *src++; \
715 else \
717 src--; \
718 c = - string_char (src, &src, NULL); \
719 record_conversion_result \
720 (coding, CODING_RESULT_INVALID_SRC); \
723 consumed_chars++; \
724 } while (0)
726 /* Safely get two bytes from the source text pointed by SRC which ends
727 at SRC_END, and set C1 and C2 to those bytes while skipping the
728 heading multibyte characters. If there are not enough bytes in the
729 source, it jumps to 'no_more_source'. If MULTIBYTEP and
730 a multibyte character is found for C2, set C2 to the negative value
731 of the character code. The caller should declare and set these
732 variables appropriately in advance:
733 src, src_end, multibytep
734 It is intended that this macro is used in detect_coding_utf_16. */
736 #define TWO_MORE_BYTES(c1, c2) \
737 do { \
738 do { \
739 if (src == src_end) \
740 goto no_more_source; \
741 c1 = *src++; \
742 if (multibytep && (c1 & 0x80)) \
744 if ((c1 & 0xFE) == 0xC0) \
745 c1 = ((c1 & 1) << 6) | *src++; \
746 else \
748 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
749 c1 = -1; \
752 } while (c1 < 0); \
753 if (src == src_end) \
754 goto no_more_source; \
755 c2 = *src++; \
756 if (multibytep && (c2 & 0x80)) \
758 if ((c2 & 0xFE) == 0xC0) \
759 c2 = ((c2 & 1) << 6) | *src++; \
760 else \
761 c2 = -1; \
763 } while (0)
766 /* Store a byte C in the place pointed by DST and increment DST to the
767 next free point, and increment PRODUCED_CHARS. The caller should
768 assure that C is 0..127, and declare and set the variable `dst'
769 appropriately in advance.
773 #define EMIT_ONE_ASCII_BYTE(c) \
774 do { \
775 produced_chars++; \
776 *dst++ = (c); \
777 } while (0)
780 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
782 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
783 do { \
784 produced_chars += 2; \
785 *dst++ = (c1), *dst++ = (c2); \
786 } while (0)
789 /* Store a byte C in the place pointed by DST and increment DST to the
790 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
791 store in an appropriate multibyte form. The caller should
792 declare and set the variables `dst' and `multibytep' appropriately
793 in advance. */
795 #define EMIT_ONE_BYTE(c) \
796 do { \
797 produced_chars++; \
798 if (multibytep) \
800 unsigned ch = (c); \
801 if (ch >= 0x80) \
802 ch = BYTE8_TO_CHAR (ch); \
803 CHAR_STRING_ADVANCE (ch, dst); \
805 else \
806 *dst++ = (c); \
807 } while (0)
810 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
812 #define EMIT_TWO_BYTES(c1, c2) \
813 do { \
814 produced_chars += 2; \
815 if (multibytep) \
817 unsigned ch; \
819 ch = (c1); \
820 if (ch >= 0x80) \
821 ch = BYTE8_TO_CHAR (ch); \
822 CHAR_STRING_ADVANCE (ch, dst); \
823 ch = (c2); \
824 if (ch >= 0x80) \
825 ch = BYTE8_TO_CHAR (ch); \
826 CHAR_STRING_ADVANCE (ch, dst); \
828 else \
830 *dst++ = (c1); \
831 *dst++ = (c2); \
833 } while (0)
836 #define EMIT_THREE_BYTES(c1, c2, c3) \
837 do { \
838 EMIT_ONE_BYTE (c1); \
839 EMIT_TWO_BYTES (c2, c3); \
840 } while (0)
843 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
844 do { \
845 EMIT_TWO_BYTES (c1, c2); \
846 EMIT_TWO_BYTES (c3, c4); \
847 } while (0)
850 static void
851 record_conversion_result (struct coding_system *coding,
852 enum coding_result_code result)
854 coding->result = result;
855 switch (result)
857 case CODING_RESULT_INSUFFICIENT_SRC:
858 Vlast_code_conversion_error = Qinsufficient_source;
859 break;
860 case CODING_RESULT_INVALID_SRC:
861 Vlast_code_conversion_error = Qinvalid_source;
862 break;
863 case CODING_RESULT_INTERRUPT:
864 Vlast_code_conversion_error = Qinterrupted;
865 break;
866 case CODING_RESULT_INSUFFICIENT_DST:
867 /* Don't record this error in Vlast_code_conversion_error
868 because it happens just temporarily and is resolved when the
869 whole conversion is finished. */
870 break;
871 case CODING_RESULT_SUCCESS:
872 break;
873 default:
874 Vlast_code_conversion_error = intern ("Unknown error");
878 /* These wrapper macros are used to preserve validity of pointers into
879 buffer text across calls to decode_char, encode_char, etc, which
880 could cause relocation of buffers if it loads a charset map,
881 because loading a charset map allocates large structures. */
883 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
884 do { \
885 ptrdiff_t offset; \
887 charset_map_loaded = 0; \
888 c = DECODE_CHAR (charset, code); \
889 if (charset_map_loaded \
890 && (offset = coding_change_source (coding))) \
892 src += offset; \
893 src_base += offset; \
894 src_end += offset; \
896 } while (0)
898 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
899 do { \
900 ptrdiff_t offset; \
902 charset_map_loaded = 0; \
903 code = ENCODE_CHAR (charset, c); \
904 if (charset_map_loaded \
905 && (offset = coding_change_destination (coding))) \
907 dst += offset; \
908 dst_end += offset; \
910 } while (0)
912 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
913 do { \
914 ptrdiff_t offset; \
916 charset_map_loaded = 0; \
917 charset = char_charset (c, charset_list, code_return); \
918 if (charset_map_loaded \
919 && (offset = coding_change_destination (coding))) \
921 dst += offset; \
922 dst_end += offset; \
924 } while (0)
926 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
927 do { \
928 ptrdiff_t offset; \
930 charset_map_loaded = 0; \
931 result = CHAR_CHARSET_P (c, charset); \
932 if (charset_map_loaded \
933 && (offset = coding_change_destination (coding))) \
935 dst += offset; \
936 dst_end += offset; \
938 } while (0)
941 /* If there are at least BYTES length of room at dst, allocate memory
942 for coding->destination and update dst and dst_end. We don't have
943 to take care of coding->source which will be relocated. It is
944 handled by calling coding_set_source in encode_coding. */
946 #define ASSURE_DESTINATION(bytes) \
947 do { \
948 if (dst + (bytes) >= dst_end) \
950 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
952 dst = alloc_destination (coding, more_bytes, dst); \
953 dst_end = coding->destination + coding->dst_bytes; \
955 } while (0)
958 /* Store multibyte form of the character C in P, and advance P to the
959 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
960 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
961 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
963 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
965 /* Return the character code of character whose multibyte form is at
966 P, and advance P to the end of the multibyte form. This used to be
967 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
968 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
970 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
972 /* Set coding->source from coding->src_object. */
974 static void
975 coding_set_source (struct coding_system *coding)
977 if (BUFFERP (coding->src_object))
979 struct buffer *buf = XBUFFER (coding->src_object);
981 if (coding->src_pos < 0)
982 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
983 else
984 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
986 else if (STRINGP (coding->src_object))
988 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
990 else
992 /* Otherwise, the source is C string and is never relocated
993 automatically. Thus we don't have to update anything. */
998 /* Set coding->source from coding->src_object, and return how many
999 bytes coding->source was changed. */
1001 static ptrdiff_t
1002 coding_change_source (struct coding_system *coding)
1004 const unsigned char *orig = coding->source;
1005 coding_set_source (coding);
1006 return coding->source - orig;
1010 /* Set coding->destination from coding->dst_object. */
1012 static void
1013 coding_set_destination (struct coding_system *coding)
1015 if (BUFFERP (coding->dst_object))
1017 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1019 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1020 coding->dst_bytes = (GAP_END_ADDR
1021 - (coding->src_bytes - coding->consumed)
1022 - coding->destination);
1024 else
1026 /* We are sure that coding->dst_pos_byte is before the gap
1027 of the buffer. */
1028 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1029 + coding->dst_pos_byte - BEG_BYTE);
1030 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1031 - coding->destination);
1034 else
1036 /* Otherwise, the destination is C string and is never relocated
1037 automatically. Thus we don't have to update anything. */
1042 /* Set coding->destination from coding->dst_object, and return how
1043 many bytes coding->destination was changed. */
1045 static ptrdiff_t
1046 coding_change_destination (struct coding_system *coding)
1048 const unsigned char *orig = coding->destination;
1049 coding_set_destination (coding);
1050 return coding->destination - orig;
1054 static void
1055 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1057 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1058 string_overflow ();
1059 coding->destination = xrealloc (coding->destination,
1060 coding->dst_bytes + bytes);
1061 coding->dst_bytes += bytes;
1064 static void
1065 coding_alloc_by_making_gap (struct coding_system *coding,
1066 ptrdiff_t gap_head_used, ptrdiff_t bytes)
1068 if (EQ (coding->src_object, coding->dst_object))
1070 /* The gap may contain the produced data at the head and not-yet
1071 consumed data at the tail. To preserve those data, we at
1072 first make the gap size to zero, then increase the gap
1073 size. */
1074 ptrdiff_t add = GAP_SIZE;
1076 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1077 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1078 make_gap (bytes);
1079 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1080 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1082 else
1083 make_gap_1 (XBUFFER (coding->dst_object), bytes);
1087 static unsigned char *
1088 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1089 unsigned char *dst)
1091 ptrdiff_t offset = dst - coding->destination;
1093 if (BUFFERP (coding->dst_object))
1095 struct buffer *buf = XBUFFER (coding->dst_object);
1097 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1099 else
1100 coding_alloc_by_realloc (coding, nbytes);
1101 coding_set_destination (coding);
1102 dst = coding->destination + offset;
1103 return dst;
1106 /** Macros for annotations. */
1108 /* An annotation data is stored in the array coding->charbuf in this
1109 format:
1110 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1111 LENGTH is the number of elements in the annotation.
1112 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1113 NCHARS is the number of characters in the text annotated.
1115 The format of the following elements depend on ANNOTATION_MASK.
1117 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1118 follows:
1119 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1121 NBYTES is the number of bytes specified in the header part of
1122 old-style emacs-mule encoding, or 0 for the other kind of
1123 composition.
1125 METHOD is one of enum composition_method.
1127 Optional COMPOSITION-COMPONENTS are characters and composition
1128 rules.
1130 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1131 follows.
1133 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1134 recover from an invalid annotation, and should be skipped by
1135 produce_annotation. */
1137 /* Maximum length of the header of annotation data. */
1138 #define MAX_ANNOTATION_LENGTH 5
1140 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1141 do { \
1142 *(buf)++ = -(len); \
1143 *(buf)++ = (mask); \
1144 *(buf)++ = (nchars); \
1145 coding->annotated = 1; \
1146 } while (0);
1148 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1149 do { \
1150 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1151 *buf++ = nbytes; \
1152 *buf++ = method; \
1153 } while (0)
1156 #define ADD_CHARSET_DATA(buf, nchars, id) \
1157 do { \
1158 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1159 *buf++ = id; \
1160 } while (0)
1163 /* Bitmasks for coding->eol_seen. */
1165 #define EOL_SEEN_NONE 0
1166 #define EOL_SEEN_LF 1
1167 #define EOL_SEEN_CR 2
1168 #define EOL_SEEN_CRLF 4
1171 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1176 /*** 3. UTF-8 ***/
1178 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1179 Return true if a text is encoded in UTF-8. */
1181 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1182 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1183 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1184 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1185 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1186 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1188 #define UTF_8_BOM_1 0xEF
1189 #define UTF_8_BOM_2 0xBB
1190 #define UTF_8_BOM_3 0xBF
1192 /* Unlike the other detect_coding_XXX, this function counts the number
1193 of characters and checks the EOL format. */
1195 static bool
1196 detect_coding_utf_8 (struct coding_system *coding,
1197 struct coding_detection_info *detect_info)
1199 const unsigned char *src = coding->source, *src_base;
1200 const unsigned char *src_end = coding->source + coding->src_bytes;
1201 bool multibytep = coding->src_multibyte;
1202 ptrdiff_t consumed_chars = 0;
1203 bool bom_found = 0;
1204 ptrdiff_t nchars = coding->head_ascii;
1205 int eol_seen = coding->eol_seen;
1207 detect_info->checked |= CATEGORY_MASK_UTF_8;
1208 /* A coding system of this category is always ASCII compatible. */
1209 src += nchars;
1211 if (src == coding->source /* BOM should be at the head. */
1212 && src + 3 < src_end /* BOM is 3-byte long. */
1213 && src[0] == UTF_8_BOM_1
1214 && src[1] == UTF_8_BOM_2
1215 && src[2] == UTF_8_BOM_3)
1217 bom_found = 1;
1218 src += 3;
1219 nchars++;
1222 while (1)
1224 int c, c1, c2, c3, c4;
1226 src_base = src;
1227 ONE_MORE_BYTE (c);
1228 if (c < 0 || UTF_8_1_OCTET_P (c))
1230 nchars++;
1231 if (c == '\r')
1233 if (src < src_end && *src == '\n')
1235 eol_seen |= EOL_SEEN_CRLF;
1236 src++;
1237 nchars++;
1239 else
1240 eol_seen |= EOL_SEEN_CR;
1242 else if (c == '\n')
1243 eol_seen |= EOL_SEEN_LF;
1244 continue;
1246 ONE_MORE_BYTE (c1);
1247 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1248 break;
1249 if (UTF_8_2_OCTET_LEADING_P (c))
1251 nchars++;
1252 continue;
1254 ONE_MORE_BYTE (c2);
1255 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1256 break;
1257 if (UTF_8_3_OCTET_LEADING_P (c))
1259 nchars++;
1260 continue;
1262 ONE_MORE_BYTE (c3);
1263 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1264 break;
1265 if (UTF_8_4_OCTET_LEADING_P (c))
1267 nchars++;
1268 continue;
1270 ONE_MORE_BYTE (c4);
1271 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1272 break;
1273 if (UTF_8_5_OCTET_LEADING_P (c))
1275 nchars++;
1276 continue;
1278 break;
1280 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1281 return 0;
1283 no_more_source:
1284 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1286 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1287 return 0;
1289 if (bom_found)
1291 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1292 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1294 else
1296 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1297 if (nchars < src_end - coding->source)
1298 /* The found characters are less than source bytes, which
1299 means that we found a valid non-ASCII characters. */
1300 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1302 coding->detected_utf8_bytes = src_base - coding->source;
1303 coding->detected_utf8_chars = nchars;
1304 return 1;
1308 static void
1309 decode_coding_utf_8 (struct coding_system *coding)
1311 const unsigned char *src = coding->source + coding->consumed;
1312 const unsigned char *src_end = coding->source + coding->src_bytes;
1313 const unsigned char *src_base;
1314 int *charbuf = coding->charbuf + coding->charbuf_used;
1315 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1316 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1317 bool multibytep = coding->src_multibyte;
1318 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1319 bool eol_dos
1320 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1321 int byte_after_cr = -1;
1323 if (bom != utf_without_bom)
1325 int c1, c2, c3;
1327 src_base = src;
1328 ONE_MORE_BYTE (c1);
1329 if (! UTF_8_3_OCTET_LEADING_P (c1))
1330 src = src_base;
1331 else
1333 ONE_MORE_BYTE (c2);
1334 if (! UTF_8_EXTRA_OCTET_P (c2))
1335 src = src_base;
1336 else
1338 ONE_MORE_BYTE (c3);
1339 if (! UTF_8_EXTRA_OCTET_P (c3))
1340 src = src_base;
1341 else
1343 if ((c1 != UTF_8_BOM_1)
1344 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1345 src = src_base;
1346 else
1347 CODING_UTF_8_BOM (coding) = utf_without_bom;
1352 CODING_UTF_8_BOM (coding) = utf_without_bom;
1354 while (1)
1356 int c, c1, c2, c3, c4, c5;
1358 src_base = src;
1359 consumed_chars_base = consumed_chars;
1361 if (charbuf >= charbuf_end)
1363 if (byte_after_cr >= 0)
1364 src_base--;
1365 break;
1368 /* In the simple case, rapidly handle ordinary characters */
1369 if (multibytep && ! eol_dos
1370 && charbuf < charbuf_end - 6 && src < src_end - 6)
1372 while (charbuf < charbuf_end - 6 && src < src_end - 6)
1374 c1 = *src;
1375 if (c1 & 0x80)
1376 break;
1377 src++;
1378 consumed_chars++;
1379 *charbuf++ = c1;
1381 c1 = *src;
1382 if (c1 & 0x80)
1383 break;
1384 src++;
1385 consumed_chars++;
1386 *charbuf++ = c1;
1388 c1 = *src;
1389 if (c1 & 0x80)
1390 break;
1391 src++;
1392 consumed_chars++;
1393 *charbuf++ = c1;
1395 c1 = *src;
1396 if (c1 & 0x80)
1397 break;
1398 src++;
1399 consumed_chars++;
1400 *charbuf++ = c1;
1402 /* If we handled at least one character, restart the main loop. */
1403 if (src != src_base)
1404 continue;
1407 if (byte_after_cr >= 0)
1408 c1 = byte_after_cr, byte_after_cr = -1;
1409 else
1410 ONE_MORE_BYTE (c1);
1411 if (c1 < 0)
1413 c = - c1;
1415 else if (UTF_8_1_OCTET_P (c1))
1417 if (eol_dos && c1 == '\r')
1418 ONE_MORE_BYTE (byte_after_cr);
1419 c = c1;
1421 else
1423 ONE_MORE_BYTE (c2);
1424 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1425 goto invalid_code;
1426 if (UTF_8_2_OCTET_LEADING_P (c1))
1428 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1429 /* Reject overlong sequences here and below. Encoders
1430 producing them are incorrect, they can be misleading,
1431 and they mess up read/write invariance. */
1432 if (c < 128)
1433 goto invalid_code;
1435 else
1437 ONE_MORE_BYTE (c3);
1438 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1439 goto invalid_code;
1440 if (UTF_8_3_OCTET_LEADING_P (c1))
1442 c = (((c1 & 0xF) << 12)
1443 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1444 if (c < 0x800
1445 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1446 goto invalid_code;
1448 else
1450 ONE_MORE_BYTE (c4);
1451 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1452 goto invalid_code;
1453 if (UTF_8_4_OCTET_LEADING_P (c1))
1455 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1456 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1457 if (c < 0x10000)
1458 goto invalid_code;
1460 else
1462 ONE_MORE_BYTE (c5);
1463 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1464 goto invalid_code;
1465 if (UTF_8_5_OCTET_LEADING_P (c1))
1467 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1468 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1469 | (c5 & 0x3F));
1470 if ((c > MAX_CHAR) || (c < 0x200000))
1471 goto invalid_code;
1473 else
1474 goto invalid_code;
1480 *charbuf++ = c;
1481 continue;
1483 invalid_code:
1484 src = src_base;
1485 consumed_chars = consumed_chars_base;
1486 ONE_MORE_BYTE (c);
1487 *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
1490 no_more_source:
1491 coding->consumed_char += consumed_chars_base;
1492 coding->consumed = src_base - coding->source;
1493 coding->charbuf_used = charbuf - coding->charbuf;
1497 static bool
1498 encode_coding_utf_8 (struct coding_system *coding)
1500 bool multibytep = coding->dst_multibyte;
1501 int *charbuf = coding->charbuf;
1502 int *charbuf_end = charbuf + coding->charbuf_used;
1503 unsigned char *dst = coding->destination + coding->produced;
1504 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1505 ptrdiff_t produced_chars = 0;
1506 int c;
1508 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1510 ASSURE_DESTINATION (3);
1511 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1512 CODING_UTF_8_BOM (coding) = utf_without_bom;
1515 if (multibytep)
1517 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1519 while (charbuf < charbuf_end)
1521 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1523 ASSURE_DESTINATION (safe_room);
1524 c = *charbuf++;
1525 if (CHAR_BYTE8_P (c))
1527 c = CHAR_TO_BYTE8 (c);
1528 EMIT_ONE_BYTE (c);
1530 else
1532 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1533 for (p = str; p < pend; p++)
1534 EMIT_ONE_BYTE (*p);
1538 else
1540 int safe_room = MAX_MULTIBYTE_LENGTH;
1542 while (charbuf < charbuf_end)
1544 ASSURE_DESTINATION (safe_room);
1545 c = *charbuf++;
1546 if (CHAR_BYTE8_P (c))
1547 *dst++ = CHAR_TO_BYTE8 (c);
1548 else
1549 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1551 produced_chars = dst - (coding->destination + coding->produced);
1553 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1554 coding->produced_char += produced_chars;
1555 coding->produced = dst - coding->destination;
1556 return 0;
1560 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1561 Return true if a text is encoded in one of UTF-16 based coding systems. */
1563 #define UTF_16_HIGH_SURROGATE_P(val) \
1564 (((val) & 0xFC00) == 0xD800)
1566 #define UTF_16_LOW_SURROGATE_P(val) \
1567 (((val) & 0xFC00) == 0xDC00)
1570 static bool
1571 detect_coding_utf_16 (struct coding_system *coding,
1572 struct coding_detection_info *detect_info)
1574 const unsigned char *src = coding->source;
1575 const unsigned char *src_end = coding->source + coding->src_bytes;
1576 bool multibytep = coding->src_multibyte;
1577 int c1, c2;
1579 detect_info->checked |= CATEGORY_MASK_UTF_16;
1580 if (coding->mode & CODING_MODE_LAST_BLOCK
1581 && (coding->src_chars & 1))
1583 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1584 return 0;
1587 TWO_MORE_BYTES (c1, c2);
1588 if ((c1 == 0xFF) && (c2 == 0xFE))
1590 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1591 | CATEGORY_MASK_UTF_16_AUTO);
1592 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1593 | CATEGORY_MASK_UTF_16_BE_NOSIG
1594 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1596 else if ((c1 == 0xFE) && (c2 == 0xFF))
1598 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1599 | CATEGORY_MASK_UTF_16_AUTO);
1600 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1601 | CATEGORY_MASK_UTF_16_BE_NOSIG
1602 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1604 else if (c2 < 0)
1606 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1607 return 0;
1609 else
1611 /* We check the dispersion of Eth and Oth bytes where E is even and
1612 O is odd. If both are high, we assume binary data.*/
1613 unsigned char e[256], o[256];
1614 unsigned e_num = 1, o_num = 1;
1616 memset (e, 0, 256);
1617 memset (o, 0, 256);
1618 e[c1] = 1;
1619 o[c2] = 1;
1621 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1622 |CATEGORY_MASK_UTF_16_BE
1623 | CATEGORY_MASK_UTF_16_LE);
1625 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1626 != CATEGORY_MASK_UTF_16)
1628 TWO_MORE_BYTES (c1, c2);
1629 if (c2 < 0)
1630 break;
1631 if (! e[c1])
1633 e[c1] = 1;
1634 e_num++;
1635 if (e_num >= 128)
1636 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1638 if (! o[c2])
1640 o[c2] = 1;
1641 o_num++;
1642 if (o_num >= 128)
1643 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1646 return 0;
1649 no_more_source:
1650 return 1;
1653 static void
1654 decode_coding_utf_16 (struct coding_system *coding)
1656 const unsigned char *src = coding->source + coding->consumed;
1657 const unsigned char *src_end = coding->source + coding->src_bytes;
1658 const unsigned char *src_base;
1659 int *charbuf = coding->charbuf + coding->charbuf_used;
1660 /* We may produces at most 3 chars in one loop. */
1661 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1662 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1663 bool multibytep = coding->src_multibyte;
1664 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1665 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1666 int surrogate = CODING_UTF_16_SURROGATE (coding);
1667 bool eol_dos
1668 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1669 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1671 if (bom == utf_with_bom)
1673 int c, c1, c2;
1675 src_base = src;
1676 ONE_MORE_BYTE (c1);
1677 ONE_MORE_BYTE (c2);
1678 c = (c1 << 8) | c2;
1680 if (endian == utf_16_big_endian
1681 ? c != 0xFEFF : c != 0xFFFE)
1683 /* The first two bytes are not BOM. Treat them as bytes
1684 for a normal character. */
1685 src = src_base;
1687 CODING_UTF_16_BOM (coding) = utf_without_bom;
1689 else if (bom == utf_detect_bom)
1691 /* We have already tried to detect BOM and failed in
1692 detect_coding. */
1693 CODING_UTF_16_BOM (coding) = utf_without_bom;
1696 while (1)
1698 int c, c1, c2;
1700 src_base = src;
1701 consumed_chars_base = consumed_chars;
1703 if (charbuf >= charbuf_end)
1705 if (byte_after_cr1 >= 0)
1706 src_base -= 2;
1707 break;
1710 if (byte_after_cr1 >= 0)
1711 c1 = byte_after_cr1, byte_after_cr1 = -1;
1712 else
1713 ONE_MORE_BYTE (c1);
1714 if (c1 < 0)
1716 *charbuf++ = -c1;
1717 continue;
1719 if (byte_after_cr2 >= 0)
1720 c2 = byte_after_cr2, byte_after_cr2 = -1;
1721 else
1722 ONE_MORE_BYTE (c2);
1723 if (c2 < 0)
1725 *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1726 *charbuf++ = -c2;
1727 continue;
1729 c = (endian == utf_16_big_endian
1730 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1732 if (surrogate)
1734 if (! UTF_16_LOW_SURROGATE_P (c))
1736 if (endian == utf_16_big_endian)
1737 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1738 else
1739 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1740 *charbuf++ = c1;
1741 *charbuf++ = c2;
1742 if (UTF_16_HIGH_SURROGATE_P (c))
1743 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1744 else
1745 *charbuf++ = c;
1747 else
1749 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1750 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1751 *charbuf++ = 0x10000 + c;
1754 else
1756 if (UTF_16_HIGH_SURROGATE_P (c))
1757 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1758 else
1760 if (eol_dos && c == '\r')
1762 ONE_MORE_BYTE (byte_after_cr1);
1763 ONE_MORE_BYTE (byte_after_cr2);
1765 *charbuf++ = c;
1770 no_more_source:
1771 coding->consumed_char += consumed_chars_base;
1772 coding->consumed = src_base - coding->source;
1773 coding->charbuf_used = charbuf - coding->charbuf;
1776 static bool
1777 encode_coding_utf_16 (struct coding_system *coding)
1779 bool multibytep = coding->dst_multibyte;
1780 int *charbuf = coding->charbuf;
1781 int *charbuf_end = charbuf + coding->charbuf_used;
1782 unsigned char *dst = coding->destination + coding->produced;
1783 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1784 int safe_room = 8;
1785 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1786 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1787 ptrdiff_t produced_chars = 0;
1788 int c;
1790 if (bom != utf_without_bom)
1792 ASSURE_DESTINATION (safe_room);
1793 if (big_endian)
1794 EMIT_TWO_BYTES (0xFE, 0xFF);
1795 else
1796 EMIT_TWO_BYTES (0xFF, 0xFE);
1797 CODING_UTF_16_BOM (coding) = utf_without_bom;
1800 while (charbuf < charbuf_end)
1802 ASSURE_DESTINATION (safe_room);
1803 c = *charbuf++;
1804 if (c > MAX_UNICODE_CHAR)
1805 c = coding->default_char;
1807 if (c < 0x10000)
1809 if (big_endian)
1810 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1811 else
1812 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1814 else
1816 int c1, c2;
1818 c -= 0x10000;
1819 c1 = (c >> 10) + 0xD800;
1820 c2 = (c & 0x3FF) + 0xDC00;
1821 if (big_endian)
1822 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1823 else
1824 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1827 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1828 coding->produced = dst - coding->destination;
1829 coding->produced_char += produced_chars;
1830 return 0;
1834 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1836 /* Emacs' internal format for representation of multiple character
1837 sets is a kind of multi-byte encoding, i.e. characters are
1838 represented by variable-length sequences of one-byte codes.
1840 ASCII characters and control characters (e.g. `tab', `newline') are
1841 represented by one-byte sequences which are their ASCII codes, in
1842 the range 0x00 through 0x7F.
1844 8-bit characters of the range 0x80..0x9F are represented by
1845 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1846 code + 0x20).
1848 8-bit characters of the range 0xA0..0xFF are represented by
1849 one-byte sequences which are their 8-bit code.
1851 The other characters are represented by a sequence of `base
1852 leading-code', optional `extended leading-code', and one or two
1853 `position-code's. The length of the sequence is determined by the
1854 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1855 whereas extended leading-code and position-code take the range 0xA0
1856 through 0xFF. See `charset.h' for more details about leading-code
1857 and position-code.
1859 --- CODE RANGE of Emacs' internal format ---
1860 character set range
1861 ------------- -----
1862 ascii 0x00..0x7F
1863 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1864 eight-bit-graphic 0xA0..0xBF
1865 ELSE 0x81..0x9D + [0xA0..0xFF]+
1866 ---------------------------------------------
1868 As this is the internal character representation, the format is
1869 usually not used externally (i.e. in a file or in a data sent to a
1870 process). But, it is possible to have a text externally in this
1871 format (i.e. by encoding by the coding system `emacs-mule').
1873 In that case, a sequence of one-byte codes has a slightly different
1874 form.
1876 At first, all characters in eight-bit-control are represented by
1877 one-byte sequences which are their 8-bit code.
1879 Next, character composition data are represented by the byte
1880 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1881 where,
1882 METHOD is 0xF2 plus one of composition method (enum
1883 composition_method),
1885 BYTES is 0xA0 plus a byte length of this composition data,
1887 CHARS is 0xA0 plus a number of characters composed by this
1888 data,
1890 COMPONENTs are characters of multibyte form or composition
1891 rules encoded by two-byte of ASCII codes.
1893 In addition, for backward compatibility, the following formats are
1894 also recognized as composition data on decoding.
1896 0x80 MSEQ ...
1897 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1899 Here,
1900 MSEQ is a multibyte form but in these special format:
1901 ASCII: 0xA0 ASCII_CODE+0x80,
1902 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1903 RULE is a one byte code of the range 0xA0..0xF0 that
1904 represents a composition rule.
1907 char emacs_mule_bytes[256];
1910 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1911 Return true if a text is encoded in 'emacs-mule'. */
1913 static bool
1914 detect_coding_emacs_mule (struct coding_system *coding,
1915 struct coding_detection_info *detect_info)
1917 const unsigned char *src = coding->source, *src_base;
1918 const unsigned char *src_end = coding->source + coding->src_bytes;
1919 bool multibytep = coding->src_multibyte;
1920 ptrdiff_t consumed_chars = 0;
1921 int c;
1922 int found = 0;
1924 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1925 /* A coding system of this category is always ASCII compatible. */
1926 src += coding->head_ascii;
1928 while (1)
1930 src_base = src;
1931 ONE_MORE_BYTE (c);
1932 if (c < 0)
1933 continue;
1934 if (c == 0x80)
1936 /* Perhaps the start of composite character. We simply skip
1937 it because analyzing it is too heavy for detecting. But,
1938 at least, we check that the composite character
1939 constitutes of more than 4 bytes. */
1940 const unsigned char *src_start;
1942 repeat:
1943 src_start = src;
1946 ONE_MORE_BYTE (c);
1948 while (c >= 0xA0);
1950 if (src - src_start <= 4)
1951 break;
1952 found = CATEGORY_MASK_EMACS_MULE;
1953 if (c == 0x80)
1954 goto repeat;
1957 if (c < 0x80)
1959 if (c < 0x20
1960 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1961 break;
1963 else
1965 int more_bytes = emacs_mule_bytes[c] - 1;
1967 while (more_bytes > 0)
1969 ONE_MORE_BYTE (c);
1970 if (c < 0xA0)
1972 src--; /* Unread the last byte. */
1973 break;
1975 more_bytes--;
1977 if (more_bytes != 0)
1978 break;
1979 found = CATEGORY_MASK_EMACS_MULE;
1982 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1983 return 0;
1985 no_more_source:
1986 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1988 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1989 return 0;
1991 detect_info->found |= found;
1992 return 1;
1996 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1997 character. If CMP_STATUS indicates that we must expect MSEQ or
1998 RULE described above, decode it and return the negative value of
1999 the decoded character or rule. If an invalid byte is found, return
2000 -1. If SRC is too short, return -2. */
2002 static int
2003 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2004 int *nbytes, int *nchars, int *id,
2005 struct composition_status *cmp_status)
2007 const unsigned char *src_end = coding->source + coding->src_bytes;
2008 const unsigned char *src_base = src;
2009 bool multibytep = coding->src_multibyte;
2010 int charset_ID;
2011 unsigned code;
2012 int c;
2013 ptrdiff_t consumed_chars = 0;
2014 bool mseq_found = 0;
2016 ONE_MORE_BYTE (c);
2017 if (c < 0)
2019 c = -c;
2020 charset_ID = emacs_mule_charset[0];
2022 else
2024 if (c >= 0xA0)
2026 if (cmp_status->state != COMPOSING_NO
2027 && cmp_status->old_form)
2029 if (cmp_status->state == COMPOSING_CHAR)
2031 if (c == 0xA0)
2033 ONE_MORE_BYTE (c);
2034 c -= 0x80;
2035 if (c < 0)
2036 goto invalid_code;
2038 else
2039 c -= 0x20;
2040 mseq_found = 1;
2042 else
2044 *nbytes = src - src_base;
2045 *nchars = consumed_chars;
2046 return -c;
2049 else
2050 goto invalid_code;
2053 switch (emacs_mule_bytes[c])
2055 case 2:
2056 if ((charset_ID = emacs_mule_charset[c]) < 0)
2057 goto invalid_code;
2058 ONE_MORE_BYTE (c);
2059 if (c < 0xA0)
2060 goto invalid_code;
2061 code = c & 0x7F;
2062 break;
2064 case 3:
2065 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2066 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2068 ONE_MORE_BYTE (c);
2069 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2070 goto invalid_code;
2071 ONE_MORE_BYTE (c);
2072 if (c < 0xA0)
2073 goto invalid_code;
2074 code = c & 0x7F;
2076 else
2078 if ((charset_ID = emacs_mule_charset[c]) < 0)
2079 goto invalid_code;
2080 ONE_MORE_BYTE (c);
2081 if (c < 0xA0)
2082 goto invalid_code;
2083 code = (c & 0x7F) << 8;
2084 ONE_MORE_BYTE (c);
2085 if (c < 0xA0)
2086 goto invalid_code;
2087 code |= c & 0x7F;
2089 break;
2091 case 4:
2092 ONE_MORE_BYTE (c);
2093 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2094 goto invalid_code;
2095 ONE_MORE_BYTE (c);
2096 if (c < 0xA0)
2097 goto invalid_code;
2098 code = (c & 0x7F) << 8;
2099 ONE_MORE_BYTE (c);
2100 if (c < 0xA0)
2101 goto invalid_code;
2102 code |= c & 0x7F;
2103 break;
2105 case 1:
2106 code = c;
2107 charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2108 break;
2110 default:
2111 emacs_abort ();
2113 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2114 CHARSET_FROM_ID (charset_ID), code, c);
2115 if (c < 0)
2116 goto invalid_code;
2118 *nbytes = src - src_base;
2119 *nchars = consumed_chars;
2120 if (id)
2121 *id = charset_ID;
2122 return (mseq_found ? -c : c);
2124 no_more_source:
2125 return -2;
2127 invalid_code:
2128 return -1;
2132 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2134 /* Handle these composition sequence ('|': the end of header elements,
2135 BYTES and CHARS >= 0xA0):
2137 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2138 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2139 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2141 and these old form:
2143 (4) relative composition: 0x80 | MSEQ ... MSEQ
2144 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2146 When the starter 0x80 and the following header elements are found,
2147 this annotation header is produced.
2149 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2151 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2152 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2154 Then, upon reading the following elements, these codes are produced
2155 until the composition end is found:
2157 (1) CHAR ... CHAR
2158 (2) ALT ... ALT CHAR ... CHAR
2159 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2160 (4) CHAR ... CHAR
2161 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2163 When the composition end is found, LENGTH and NCHARS in the
2164 annotation header is updated as below:
2166 (1) LENGTH: unchanged, NCHARS: unchanged
2167 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2168 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2169 (4) LENGTH: unchanged, NCHARS: number of CHARs
2170 (5) LENGTH: unchanged, NCHARS: number of CHARs
2172 If an error is found while composing, the annotation header is
2173 changed to the original composition header (plus filler -1s) as
2174 below:
2176 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2177 (5) [ 0x80 0xFF -1 -1- -1 ]
2179 and the sequence [ -2 DECODED-RULE ] is changed to the original
2180 byte sequence as below:
2181 o the original byte sequence is B: [ B -1 ]
2182 o the original byte sequence is B1 B2: [ B1 B2 ]
2184 Most of the routines are implemented by macros because many
2185 variables and labels in the caller decode_coding_emacs_mule must be
2186 accessible, and they are usually called just once (thus doesn't
2187 increase the size of compiled object). */
2189 /* Decode a composition rule represented by C as a component of
2190 composition sequence of Emacs 20 style. Set RULE to the decoded
2191 rule. */
2193 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2194 do { \
2195 int gref, nref; \
2197 c -= 0xA0; \
2198 if (c < 0 || c >= 81) \
2199 goto invalid_code; \
2200 gref = c / 9, nref = c % 9; \
2201 if (gref == 4) gref = 10; \
2202 if (nref == 4) nref = 10; \
2203 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2204 } while (0)
2207 /* Decode a composition rule represented by C and the following byte
2208 at SRC as a component of composition sequence of Emacs 21 style.
2209 Set RULE to the decoded rule. */
2211 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2212 do { \
2213 int gref, nref; \
2215 gref = c - 0x20; \
2216 if (gref < 0 || gref >= 81) \
2217 goto invalid_code; \
2218 ONE_MORE_BYTE (c); \
2219 nref = c - 0x20; \
2220 if (nref < 0 || nref >= 81) \
2221 goto invalid_code; \
2222 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2223 } while (0)
2226 /* Start of Emacs 21 style format. The first three bytes at SRC are
2227 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2228 byte length of this composition information, CHARS is the number of
2229 characters composed by this composition. */
2231 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2232 do { \
2233 enum composition_method method = c - 0xF2; \
2234 int nbytes, nchars; \
2236 ONE_MORE_BYTE (c); \
2237 if (c < 0) \
2238 goto invalid_code; \
2239 nbytes = c - 0xA0; \
2240 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2241 goto invalid_code; \
2242 ONE_MORE_BYTE (c); \
2243 nchars = c - 0xA0; \
2244 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2245 goto invalid_code; \
2246 cmp_status->old_form = 0; \
2247 cmp_status->method = method; \
2248 if (method == COMPOSITION_RELATIVE) \
2249 cmp_status->state = COMPOSING_CHAR; \
2250 else \
2251 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2252 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2253 cmp_status->nchars = nchars; \
2254 cmp_status->ncomps = nbytes - 4; \
2255 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2256 } while (0)
2259 /* Start of Emacs 20 style format for relative composition. */
2261 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2262 do { \
2263 cmp_status->old_form = 1; \
2264 cmp_status->method = COMPOSITION_RELATIVE; \
2265 cmp_status->state = COMPOSING_CHAR; \
2266 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2267 cmp_status->nchars = cmp_status->ncomps = 0; \
2268 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2269 } while (0)
2272 /* Start of Emacs 20 style format for rule-base composition. */
2274 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2275 do { \
2276 cmp_status->old_form = 1; \
2277 cmp_status->method = COMPOSITION_WITH_RULE; \
2278 cmp_status->state = COMPOSING_CHAR; \
2279 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2280 cmp_status->nchars = cmp_status->ncomps = 0; \
2281 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2282 } while (0)
2285 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2286 do { \
2287 const unsigned char *current_src = src; \
2289 ONE_MORE_BYTE (c); \
2290 if (c < 0) \
2291 goto invalid_code; \
2292 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2293 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2294 DECODE_EMACS_MULE_21_COMPOSITION (); \
2295 else if (c < 0xA0) \
2296 goto invalid_code; \
2297 else if (c < 0xC0) \
2299 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2300 /* Re-read C as a composition component. */ \
2301 src = current_src; \
2303 else if (c == 0xFF) \
2304 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2305 else \
2306 goto invalid_code; \
2307 } while (0)
2309 #define EMACS_MULE_COMPOSITION_END() \
2310 do { \
2311 int idx = - cmp_status->length; \
2313 if (cmp_status->old_form) \
2314 charbuf[idx + 2] = cmp_status->nchars; \
2315 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2316 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2317 cmp_status->state = COMPOSING_NO; \
2318 } while (0)
2321 static int
2322 emacs_mule_finish_composition (int *charbuf,
2323 struct composition_status *cmp_status)
2325 int idx = - cmp_status->length;
2326 int new_chars;
2328 if (cmp_status->old_form && cmp_status->nchars > 0)
2330 charbuf[idx + 2] = cmp_status->nchars;
2331 new_chars = 0;
2332 if (cmp_status->method == COMPOSITION_WITH_RULE
2333 && cmp_status->state == COMPOSING_CHAR)
2335 /* The last rule was invalid. */
2336 int rule = charbuf[-1] + 0xA0;
2338 charbuf[-2] = BYTE8_TO_CHAR (rule);
2339 charbuf[-1] = -1;
2340 new_chars = 1;
2343 else
2345 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2347 if (cmp_status->method == COMPOSITION_WITH_RULE)
2349 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2350 charbuf[idx++] = -3;
2351 charbuf[idx++] = 0;
2352 new_chars = 1;
2354 else
2356 int nchars = charbuf[idx + 1] + 0xA0;
2357 int nbytes = charbuf[idx + 2] + 0xA0;
2359 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2360 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2361 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2362 charbuf[idx++] = -1;
2363 new_chars = 4;
2366 cmp_status->state = COMPOSING_NO;
2367 return new_chars;
2370 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2371 do { \
2372 if (cmp_status->state != COMPOSING_NO) \
2373 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2374 } while (0)
2377 static void
2378 decode_coding_emacs_mule (struct coding_system *coding)
2380 const unsigned char *src = coding->source + coding->consumed;
2381 const unsigned char *src_end = coding->source + coding->src_bytes;
2382 const unsigned char *src_base;
2383 int *charbuf = coding->charbuf + coding->charbuf_used;
2384 /* We may produce two annotations (charset and composition) in one
2385 loop and one more charset annotation at the end. */
2386 int *charbuf_end
2387 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2388 /* We can produce up to 2 characters in a loop. */
2389 - 1;
2390 ptrdiff_t consumed_chars = 0, consumed_chars_base;
2391 bool multibytep = coding->src_multibyte;
2392 ptrdiff_t char_offset = coding->produced_char;
2393 ptrdiff_t last_offset = char_offset;
2394 int last_id = charset_ascii;
2395 bool eol_dos
2396 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2397 int byte_after_cr = -1;
2398 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2400 if (cmp_status->state != COMPOSING_NO)
2402 int i;
2404 if (charbuf_end - charbuf < cmp_status->length)
2405 emacs_abort ();
2406 for (i = 0; i < cmp_status->length; i++)
2407 *charbuf++ = cmp_status->carryover[i];
2408 coding->annotated = 1;
2411 while (1)
2413 int c, id IF_LINT (= 0);
2415 src_base = src;
2416 consumed_chars_base = consumed_chars;
2418 if (charbuf >= charbuf_end)
2420 if (byte_after_cr >= 0)
2421 src_base--;
2422 break;
2425 if (byte_after_cr >= 0)
2426 c = byte_after_cr, byte_after_cr = -1;
2427 else
2428 ONE_MORE_BYTE (c);
2430 if (c < 0 || c == 0x80)
2432 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2433 if (c < 0)
2435 *charbuf++ = -c;
2436 char_offset++;
2438 else
2439 DECODE_EMACS_MULE_COMPOSITION_START ();
2440 continue;
2443 if (c < 0x80)
2445 if (eol_dos && c == '\r')
2446 ONE_MORE_BYTE (byte_after_cr);
2447 id = charset_ascii;
2448 if (cmp_status->state != COMPOSING_NO)
2450 if (cmp_status->old_form)
2451 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2452 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2453 cmp_status->ncomps--;
2456 else
2458 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2459 /* emacs_mule_char can load a charset map from a file, which
2460 allocates a large structure and might cause buffer text
2461 to be relocated as result. Thus, we need to remember the
2462 original pointer to buffer text, and fix up all related
2463 pointers after the call. */
2464 const unsigned char *orig = coding->source;
2465 ptrdiff_t offset;
2467 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2468 cmp_status);
2469 offset = coding->source - orig;
2470 if (offset)
2472 src += offset;
2473 src_base += offset;
2474 src_end += offset;
2476 if (c < 0)
2478 if (c == -1)
2479 goto invalid_code;
2480 if (c == -2)
2481 break;
2483 src = src_base + nbytes;
2484 consumed_chars = consumed_chars_base + nchars;
2485 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2486 cmp_status->ncomps -= nchars;
2489 /* Now if C >= 0, we found a normally encoded character, if C <
2490 0, we found an old-style composition component character or
2491 rule. */
2493 if (cmp_status->state == COMPOSING_NO)
2495 if (last_id != id)
2497 if (last_id != charset_ascii)
2498 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2499 last_id);
2500 last_id = id;
2501 last_offset = char_offset;
2503 *charbuf++ = c;
2504 char_offset++;
2506 else if (cmp_status->state == COMPOSING_CHAR)
2508 if (cmp_status->old_form)
2510 if (c >= 0)
2512 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2513 *charbuf++ = c;
2514 char_offset++;
2516 else
2518 *charbuf++ = -c;
2519 cmp_status->nchars++;
2520 cmp_status->length++;
2521 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2522 EMACS_MULE_COMPOSITION_END ();
2523 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2524 cmp_status->state = COMPOSING_RULE;
2527 else
2529 *charbuf++ = c;
2530 cmp_status->length++;
2531 cmp_status->nchars--;
2532 if (cmp_status->nchars == 0)
2533 EMACS_MULE_COMPOSITION_END ();
2536 else if (cmp_status->state == COMPOSING_RULE)
2538 int rule;
2540 if (c >= 0)
2542 EMACS_MULE_COMPOSITION_END ();
2543 *charbuf++ = c;
2544 char_offset++;
2546 else
2548 c = -c;
2549 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2550 if (rule < 0)
2551 goto invalid_code;
2552 *charbuf++ = -2;
2553 *charbuf++ = rule;
2554 cmp_status->length += 2;
2555 cmp_status->state = COMPOSING_CHAR;
2558 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2560 *charbuf++ = c;
2561 cmp_status->length++;
2562 if (cmp_status->ncomps == 0)
2563 cmp_status->state = COMPOSING_CHAR;
2564 else if (cmp_status->ncomps > 0)
2566 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2567 cmp_status->state = COMPOSING_COMPONENT_RULE;
2569 else
2570 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2572 else /* COMPOSING_COMPONENT_RULE */
2574 int rule;
2576 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2577 if (rule < 0)
2578 goto invalid_code;
2579 *charbuf++ = -2;
2580 *charbuf++ = rule;
2581 cmp_status->length += 2;
2582 cmp_status->ncomps--;
2583 if (cmp_status->ncomps > 0)
2584 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2585 else
2586 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2588 continue;
2590 invalid_code:
2591 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2592 src = src_base;
2593 consumed_chars = consumed_chars_base;
2594 ONE_MORE_BYTE (c);
2595 *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2596 char_offset++;
2599 no_more_source:
2600 if (cmp_status->state != COMPOSING_NO)
2602 if (coding->mode & CODING_MODE_LAST_BLOCK)
2603 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2604 else
2606 int i;
2608 charbuf -= cmp_status->length;
2609 for (i = 0; i < cmp_status->length; i++)
2610 cmp_status->carryover[i] = charbuf[i];
2613 if (last_id != charset_ascii)
2614 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2615 coding->consumed_char += consumed_chars_base;
2616 coding->consumed = src_base - coding->source;
2617 coding->charbuf_used = charbuf - coding->charbuf;
2621 #define EMACS_MULE_LEADING_CODES(id, codes) \
2622 do { \
2623 if (id < 0xA0) \
2624 codes[0] = id, codes[1] = 0; \
2625 else if (id < 0xE0) \
2626 codes[0] = 0x9A, codes[1] = id; \
2627 else if (id < 0xF0) \
2628 codes[0] = 0x9B, codes[1] = id; \
2629 else if (id < 0xF5) \
2630 codes[0] = 0x9C, codes[1] = id; \
2631 else \
2632 codes[0] = 0x9D, codes[1] = id; \
2633 } while (0);
2636 static bool
2637 encode_coding_emacs_mule (struct coding_system *coding)
2639 bool multibytep = coding->dst_multibyte;
2640 int *charbuf = coding->charbuf;
2641 int *charbuf_end = charbuf + coding->charbuf_used;
2642 unsigned char *dst = coding->destination + coding->produced;
2643 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2644 int safe_room = 8;
2645 ptrdiff_t produced_chars = 0;
2646 Lisp_Object attrs, charset_list;
2647 int c;
2648 int preferred_charset_id = -1;
2650 CODING_GET_INFO (coding, attrs, charset_list);
2651 if (! EQ (charset_list, Vemacs_mule_charset_list))
2653 charset_list = Vemacs_mule_charset_list;
2654 ASET (attrs, coding_attr_charset_list, charset_list);
2657 while (charbuf < charbuf_end)
2659 ASSURE_DESTINATION (safe_room);
2660 c = *charbuf++;
2662 if (c < 0)
2664 /* Handle an annotation. */
2665 switch (*charbuf)
2667 case CODING_ANNOTATE_COMPOSITION_MASK:
2668 /* Not yet implemented. */
2669 break;
2670 case CODING_ANNOTATE_CHARSET_MASK:
2671 preferred_charset_id = charbuf[3];
2672 if (preferred_charset_id >= 0
2673 && NILP (Fmemq (make_number (preferred_charset_id),
2674 charset_list)))
2675 preferred_charset_id = -1;
2676 break;
2677 default:
2678 emacs_abort ();
2680 charbuf += -c - 1;
2681 continue;
2684 if (ASCII_CHAR_P (c))
2685 EMIT_ONE_ASCII_BYTE (c);
2686 else if (CHAR_BYTE8_P (c))
2688 c = CHAR_TO_BYTE8 (c);
2689 EMIT_ONE_BYTE (c);
2691 else
2693 struct charset *charset;
2694 unsigned code;
2695 int dimension;
2696 int emacs_mule_id;
2697 unsigned char leading_codes[2];
2699 if (preferred_charset_id >= 0)
2701 bool result;
2703 charset = CHARSET_FROM_ID (preferred_charset_id);
2704 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2705 if (result)
2706 code = ENCODE_CHAR (charset, c);
2707 else
2708 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2709 &code, charset);
2711 else
2712 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2713 &code, charset);
2714 if (! charset)
2716 c = coding->default_char;
2717 if (ASCII_CHAR_P (c))
2719 EMIT_ONE_ASCII_BYTE (c);
2720 continue;
2722 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2723 &code, charset);
2725 dimension = CHARSET_DIMENSION (charset);
2726 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2727 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2728 EMIT_ONE_BYTE (leading_codes[0]);
2729 if (leading_codes[1])
2730 EMIT_ONE_BYTE (leading_codes[1]);
2731 if (dimension == 1)
2732 EMIT_ONE_BYTE (code | 0x80);
2733 else
2735 code |= 0x8080;
2736 EMIT_ONE_BYTE (code >> 8);
2737 EMIT_ONE_BYTE (code & 0xFF);
2741 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2742 coding->produced_char += produced_chars;
2743 coding->produced = dst - coding->destination;
2744 return 0;
2748 /*** 7. ISO2022 handlers ***/
2750 /* The following note describes the coding system ISO2022 briefly.
2751 Since the intention of this note is to help understand the
2752 functions in this file, some parts are NOT ACCURATE or are OVERLY
2753 SIMPLIFIED. For thorough understanding, please refer to the
2754 original document of ISO2022. This is equivalent to the standard
2755 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2757 ISO2022 provides many mechanisms to encode several character sets
2758 in 7-bit and 8-bit environments. For 7-bit environments, all text
2759 is encoded using bytes less than 128. This may make the encoded
2760 text a little bit longer, but the text passes more easily through
2761 several types of gateway, some of which strip off the MSB (Most
2762 Significant Bit).
2764 There are two kinds of character sets: control character sets and
2765 graphic character sets. The former contain control characters such
2766 as `newline' and `escape' to provide control functions (control
2767 functions are also provided by escape sequences). The latter
2768 contain graphic characters such as 'A' and '-'. Emacs recognizes
2769 two control character sets and many graphic character sets.
2771 Graphic character sets are classified into one of the following
2772 four classes, according to the number of bytes (DIMENSION) and
2773 number of characters in one dimension (CHARS) of the set:
2774 - DIMENSION1_CHARS94
2775 - DIMENSION1_CHARS96
2776 - DIMENSION2_CHARS94
2777 - DIMENSION2_CHARS96
2779 In addition, each character set is assigned an identification tag,
2780 unique for each set, called the "final character" (denoted as <F>
2781 hereafter). The <F> of each character set is decided by ECMA(*)
2782 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2783 (0x30..0x3F are for private use only).
2785 Note (*): ECMA = European Computer Manufacturers Association
2787 Here are examples of graphic character sets [NAME(<F>)]:
2788 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2789 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2790 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2791 o DIMENSION2_CHARS96 -- none for the moment
2793 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2794 C0 [0x00..0x1F] -- control character plane 0
2795 GL [0x20..0x7F] -- graphic character plane 0
2796 C1 [0x80..0x9F] -- control character plane 1
2797 GR [0xA0..0xFF] -- graphic character plane 1
2799 A control character set is directly designated and invoked to C0 or
2800 C1 by an escape sequence. The most common case is that:
2801 - ISO646's control character set is designated/invoked to C0, and
2802 - ISO6429's control character set is designated/invoked to C1,
2803 and usually these designations/invocations are omitted in encoded
2804 text. In a 7-bit environment, only C0 can be used, and a control
2805 character for C1 is encoded by an appropriate escape sequence to
2806 fit into the environment. All control characters for C1 are
2807 defined to have corresponding escape sequences.
2809 A graphic character set is at first designated to one of four
2810 graphic registers (G0 through G3), then these graphic registers are
2811 invoked to GL or GR. These designations and invocations can be
2812 done independently. The most common case is that G0 is invoked to
2813 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2814 these invocations and designations are omitted in encoded text.
2815 In a 7-bit environment, only GL can be used.
2817 When a graphic character set of CHARS94 is invoked to GL, codes
2818 0x20 and 0x7F of the GL area work as control characters SPACE and
2819 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2820 be used.
2822 There are two ways of invocation: locking-shift and single-shift.
2823 With locking-shift, the invocation lasts until the next different
2824 invocation, whereas with single-shift, the invocation affects the
2825 following character only and doesn't affect the locking-shift
2826 state. Invocations are done by the following control characters or
2827 escape sequences:
2829 ----------------------------------------------------------------------
2830 abbrev function cntrl escape seq description
2831 ----------------------------------------------------------------------
2832 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2833 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2834 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2835 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2836 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2837 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2838 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2839 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2840 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2841 ----------------------------------------------------------------------
2842 (*) These are not used by any known coding system.
2844 Control characters for these functions are defined by macros
2845 ISO_CODE_XXX in `coding.h'.
2847 Designations are done by the following escape sequences:
2848 ----------------------------------------------------------------------
2849 escape sequence description
2850 ----------------------------------------------------------------------
2851 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2852 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2853 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2854 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2855 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2856 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2857 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2858 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2859 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2860 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2861 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2862 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2863 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2864 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2865 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2866 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2867 ----------------------------------------------------------------------
2869 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2870 of dimension 1, chars 94, and final character <F>, etc...
2872 Note (*): Although these designations are not allowed in ISO2022,
2873 Emacs accepts them on decoding, and produces them on encoding
2874 CHARS96 character sets in a coding system which is characterized as
2875 7-bit environment, non-locking-shift, and non-single-shift.
2877 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2878 '(' must be omitted. We refer to this as "short-form" hereafter.
2880 Now you may notice that there are a lot of ways of encoding the
2881 same multilingual text in ISO2022. Actually, there exist many
2882 coding systems such as Compound Text (used in X11's inter client
2883 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2884 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2885 localized platforms), and all of these are variants of ISO2022.
2887 In addition to the above, Emacs handles two more kinds of escape
2888 sequences: ISO6429's direction specification and Emacs' private
2889 sequence for specifying character composition.
2891 ISO6429's direction specification takes the following form:
2892 o CSI ']' -- end of the current direction
2893 o CSI '0' ']' -- end of the current direction
2894 o CSI '1' ']' -- start of left-to-right text
2895 o CSI '2' ']' -- start of right-to-left text
2896 The control character CSI (0x9B: control sequence introducer) is
2897 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2899 Character composition specification takes the following form:
2900 o ESC '0' -- start relative composition
2901 o ESC '1' -- end composition
2902 o ESC '2' -- start rule-base composition (*)
2903 o ESC '3' -- start relative composition with alternate chars (**)
2904 o ESC '4' -- start rule-base composition with alternate chars (**)
2905 Since these are not standard escape sequences of any ISO standard,
2906 the use of them with these meanings is restricted to Emacs only.
2908 (*) This form is used only in Emacs 20.7 and older versions,
2909 but newer versions can safely decode it.
2910 (**) This form is used only in Emacs 21.1 and newer versions,
2911 and older versions can't decode it.
2913 Here's a list of example usages of these composition escape
2914 sequences (categorized by `enum composition_method').
2916 COMPOSITION_RELATIVE:
2917 ESC 0 CHAR [ CHAR ] ESC 1
2918 COMPOSITION_WITH_RULE:
2919 ESC 2 CHAR [ RULE CHAR ] ESC 1
2920 COMPOSITION_WITH_ALTCHARS:
2921 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2922 COMPOSITION_WITH_RULE_ALTCHARS:
2923 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2925 static enum iso_code_class_type iso_code_class[256];
2927 #define SAFE_CHARSET_P(coding, id) \
2928 ((id) <= (coding)->max_charset_id \
2929 && (coding)->safe_charsets[id] != 255)
2931 static void
2932 setup_iso_safe_charsets (Lisp_Object attrs)
2934 Lisp_Object charset_list, safe_charsets;
2935 Lisp_Object request;
2936 Lisp_Object reg_usage;
2937 Lisp_Object tail;
2938 EMACS_INT reg94, reg96;
2939 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2940 int max_charset_id;
2942 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2943 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2944 && ! EQ (charset_list, Viso_2022_charset_list))
2946 charset_list = Viso_2022_charset_list;
2947 ASET (attrs, coding_attr_charset_list, charset_list);
2948 ASET (attrs, coding_attr_safe_charsets, Qnil);
2951 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2952 return;
2954 max_charset_id = 0;
2955 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2957 int id = XINT (XCAR (tail));
2958 if (max_charset_id < id)
2959 max_charset_id = id;
2962 safe_charsets = make_uninit_string (max_charset_id + 1);
2963 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2964 request = AREF (attrs, coding_attr_iso_request);
2965 reg_usage = AREF (attrs, coding_attr_iso_usage);
2966 reg94 = XINT (XCAR (reg_usage));
2967 reg96 = XINT (XCDR (reg_usage));
2969 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2971 Lisp_Object id;
2972 Lisp_Object reg;
2973 struct charset *charset;
2975 id = XCAR (tail);
2976 charset = CHARSET_FROM_ID (XINT (id));
2977 reg = Fcdr (Fassq (id, request));
2978 if (! NILP (reg))
2979 SSET (safe_charsets, XINT (id), XINT (reg));
2980 else if (charset->iso_chars_96)
2982 if (reg96 < 4)
2983 SSET (safe_charsets, XINT (id), reg96);
2985 else
2987 if (reg94 < 4)
2988 SSET (safe_charsets, XINT (id), reg94);
2991 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2995 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2996 Return true if a text is encoded in one of ISO-2022 based coding
2997 systems. */
2999 static bool
3000 detect_coding_iso_2022 (struct coding_system *coding,
3001 struct coding_detection_info *detect_info)
3003 const unsigned char *src = coding->source, *src_base = src;
3004 const unsigned char *src_end = coding->source + coding->src_bytes;
3005 bool multibytep = coding->src_multibyte;
3006 bool single_shifting = 0;
3007 int id;
3008 int c, c1;
3009 ptrdiff_t consumed_chars = 0;
3010 int i;
3011 int rejected = 0;
3012 int found = 0;
3013 int composition_count = -1;
3015 detect_info->checked |= CATEGORY_MASK_ISO;
3017 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3019 struct coding_system *this = &(coding_categories[i]);
3020 Lisp_Object attrs, val;
3022 if (this->id < 0)
3023 continue;
3024 attrs = CODING_ID_ATTRS (this->id);
3025 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3026 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3027 setup_iso_safe_charsets (attrs);
3028 val = CODING_ATTR_SAFE_CHARSETS (attrs);
3029 this->max_charset_id = SCHARS (val) - 1;
3030 this->safe_charsets = SDATA (val);
3033 /* A coding system of this category is always ASCII compatible. */
3034 src += coding->head_ascii;
3036 while (rejected != CATEGORY_MASK_ISO)
3038 src_base = src;
3039 ONE_MORE_BYTE (c);
3040 switch (c)
3042 case ISO_CODE_ESC:
3043 if (inhibit_iso_escape_detection)
3044 break;
3045 single_shifting = 0;
3046 ONE_MORE_BYTE (c);
3047 if (c == 'N' || c == 'O')
3049 /* ESC <Fe> for SS2 or SS3. */
3050 single_shifting = 1;
3051 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3053 else if (c == '1')
3055 /* End of composition. */
3056 if (composition_count < 0
3057 || composition_count > MAX_COMPOSITION_COMPONENTS)
3058 /* Invalid */
3059 break;
3060 composition_count = -1;
3061 found |= CATEGORY_MASK_ISO;
3063 else if (c >= '0' && c <= '4')
3065 /* ESC <Fp> for start/end composition. */
3066 composition_count = 0;
3068 else
3070 if (c >= '(' && c <= '/')
3072 /* Designation sequence for a charset of dimension 1. */
3073 ONE_MORE_BYTE (c1);
3074 if (c1 < ' ' || c1 >= 0x80
3075 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3077 /* Invalid designation sequence. Just ignore. */
3078 if (c1 >= 0x80)
3079 rejected |= (CATEGORY_MASK_ISO_7BIT
3080 | CATEGORY_MASK_ISO_7_ELSE);
3081 break;
3084 else if (c == '$')
3086 /* Designation sequence for a charset of dimension 2. */
3087 ONE_MORE_BYTE (c);
3088 if (c >= '@' && c <= 'B')
3089 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3090 id = iso_charset_table[1][0][c];
3091 else if (c >= '(' && c <= '/')
3093 ONE_MORE_BYTE (c1);
3094 if (c1 < ' ' || c1 >= 0x80
3095 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3097 /* Invalid designation sequence. Just ignore. */
3098 if (c1 >= 0x80)
3099 rejected |= (CATEGORY_MASK_ISO_7BIT
3100 | CATEGORY_MASK_ISO_7_ELSE);
3101 break;
3104 else
3106 /* Invalid designation sequence. Just ignore it. */
3107 if (c >= 0x80)
3108 rejected |= (CATEGORY_MASK_ISO_7BIT
3109 | CATEGORY_MASK_ISO_7_ELSE);
3110 break;
3113 else
3115 /* Invalid escape sequence. Just ignore it. */
3116 if (c >= 0x80)
3117 rejected |= (CATEGORY_MASK_ISO_7BIT
3118 | CATEGORY_MASK_ISO_7_ELSE);
3119 break;
3122 /* We found a valid designation sequence for CHARSET. */
3123 rejected |= CATEGORY_MASK_ISO_8BIT;
3124 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3125 id))
3126 found |= CATEGORY_MASK_ISO_7;
3127 else
3128 rejected |= CATEGORY_MASK_ISO_7;
3129 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3130 id))
3131 found |= CATEGORY_MASK_ISO_7_TIGHT;
3132 else
3133 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3134 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3135 id))
3136 found |= CATEGORY_MASK_ISO_7_ELSE;
3137 else
3138 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3139 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3140 id))
3141 found |= CATEGORY_MASK_ISO_8_ELSE;
3142 else
3143 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3145 break;
3147 case ISO_CODE_SO:
3148 case ISO_CODE_SI:
3149 /* Locking shift out/in. */
3150 if (inhibit_iso_escape_detection)
3151 break;
3152 single_shifting = 0;
3153 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3154 break;
3156 case ISO_CODE_CSI:
3157 /* Control sequence introducer. */
3158 single_shifting = 0;
3159 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3160 found |= CATEGORY_MASK_ISO_8_ELSE;
3161 goto check_extra_latin;
3163 case ISO_CODE_SS2:
3164 case ISO_CODE_SS3:
3165 /* Single shift. */
3166 if (inhibit_iso_escape_detection)
3167 break;
3168 single_shifting = 0;
3169 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3170 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3171 & CODING_ISO_FLAG_SINGLE_SHIFT)
3173 found |= CATEGORY_MASK_ISO_8_1;
3174 single_shifting = 1;
3176 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3177 & CODING_ISO_FLAG_SINGLE_SHIFT)
3179 found |= CATEGORY_MASK_ISO_8_2;
3180 single_shifting = 1;
3182 if (single_shifting)
3183 break;
3184 goto check_extra_latin;
3186 default:
3187 if (c < 0)
3188 continue;
3189 if (c < 0x80)
3191 if (composition_count >= 0)
3192 composition_count++;
3193 single_shifting = 0;
3194 break;
3196 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3197 if (c >= 0xA0)
3199 found |= CATEGORY_MASK_ISO_8_1;
3200 /* Check the length of succeeding codes of the range
3201 0xA0..0FF. If the byte length is even, we include
3202 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3203 only when we are not single shifting. */
3204 if (! single_shifting
3205 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3207 ptrdiff_t len = 1;
3208 while (src < src_end)
3210 src_base = src;
3211 ONE_MORE_BYTE (c);
3212 if (c < 0xA0)
3214 src = src_base;
3215 break;
3217 len++;
3220 if (len & 1 && src < src_end)
3222 rejected |= CATEGORY_MASK_ISO_8_2;
3223 if (composition_count >= 0)
3224 composition_count += len;
3226 else
3228 found |= CATEGORY_MASK_ISO_8_2;
3229 if (composition_count >= 0)
3230 composition_count += len / 2;
3233 break;
3235 check_extra_latin:
3236 if (! VECTORP (Vlatin_extra_code_table)
3237 || NILP (AREF (Vlatin_extra_code_table, c)))
3239 rejected = CATEGORY_MASK_ISO;
3240 break;
3242 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3243 & CODING_ISO_FLAG_LATIN_EXTRA)
3244 found |= CATEGORY_MASK_ISO_8_1;
3245 else
3246 rejected |= CATEGORY_MASK_ISO_8_1;
3247 rejected |= CATEGORY_MASK_ISO_8_2;
3248 break;
3251 detect_info->rejected |= CATEGORY_MASK_ISO;
3252 return 0;
3254 no_more_source:
3255 detect_info->rejected |= rejected;
3256 detect_info->found |= (found & ~rejected);
3257 return 1;
3261 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3262 escape sequence should be kept. */
3263 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3264 do { \
3265 int id, prev; \
3267 if (final < '0' || final >= 128 \
3268 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3269 || !SAFE_CHARSET_P (coding, id)) \
3271 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3272 chars_96 = -1; \
3273 break; \
3275 prev = CODING_ISO_DESIGNATION (coding, reg); \
3276 if (id == charset_jisx0201_roman) \
3278 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3279 id = charset_ascii; \
3281 else if (id == charset_jisx0208_1978) \
3283 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3284 id = charset_jisx0208; \
3286 CODING_ISO_DESIGNATION (coding, reg) = id; \
3287 /* If there was an invalid designation to REG previously, and this \
3288 designation is ASCII to REG, we should keep this designation \
3289 sequence. */ \
3290 if (prev == -2 && id == charset_ascii) \
3291 chars_96 = -1; \
3292 } while (0)
3295 /* Handle these composition sequence (ALT: alternate char):
3297 (1) relative composition: ESC 0 CHAR ... ESC 1
3298 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3299 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3300 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3302 When the start sequence (ESC 0/2/3/4) is found, this annotation
3303 header is produced.
3305 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3307 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3308 produced until the end sequence (ESC 1) is found:
3310 (1) CHAR ... CHAR
3311 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3312 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3313 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3315 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3316 annotation header is updated as below:
3318 (1) LENGTH: unchanged, NCHARS: number of CHARs
3319 (2) LENGTH: unchanged, NCHARS: number of CHARs
3320 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3321 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3323 If an error is found while composing, the annotation header is
3324 changed to:
3326 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3328 and the sequence [ -2 DECODED-RULE ] is changed to the original
3329 byte sequence as below:
3330 o the original byte sequence is B: [ B -1 ]
3331 o the original byte sequence is B1 B2: [ B1 B2 ]
3332 and the sequence [ -1 -1 ] is changed to the original byte
3333 sequence:
3334 [ ESC '0' ]
3337 /* Decode a composition rule C1 and maybe one more byte from the
3338 source, and set RULE to the encoded composition rule. If the rule
3339 is invalid, goto invalid_code. */
3341 #define DECODE_COMPOSITION_RULE(rule) \
3342 do { \
3343 rule = c1 - 32; \
3344 if (rule < 0) \
3345 goto invalid_code; \
3346 if (rule < 81) /* old format (before ver.21) */ \
3348 int gref = (rule) / 9; \
3349 int nref = (rule) % 9; \
3350 if (gref == 4) gref = 10; \
3351 if (nref == 4) nref = 10; \
3352 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3354 else /* new format (after ver.21) */ \
3356 int b; \
3358 ONE_MORE_BYTE (b); \
3359 if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32)) \
3360 goto invalid_code; \
3361 rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32); \
3362 rule += 0x100; /* Distinguish it from the old format. */ \
3364 } while (0)
3366 #define ENCODE_COMPOSITION_RULE(rule) \
3367 do { \
3368 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3370 if (rule < 0x100) /* old format */ \
3372 if (gref == 10) gref = 4; \
3373 if (nref == 10) nref = 4; \
3374 charbuf[idx] = 32 + gref * 9 + nref; \
3375 charbuf[idx + 1] = -1; \
3376 new_chars++; \
3378 else /* new format */ \
3380 charbuf[idx] = 32 + 81 + gref; \
3381 charbuf[idx + 1] = 32 + nref; \
3382 new_chars += 2; \
3384 } while (0)
3386 /* Finish the current composition as invalid. */
3388 static int
3389 finish_composition (int *charbuf, struct composition_status *cmp_status)
3391 int idx = - cmp_status->length;
3392 int new_chars;
3394 /* Recover the original ESC sequence */
3395 charbuf[idx++] = ISO_CODE_ESC;
3396 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3397 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3398 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3399 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3400 : '4');
3401 charbuf[idx++] = -2;
3402 charbuf[idx++] = 0;
3403 charbuf[idx++] = -1;
3404 new_chars = cmp_status->nchars;
3405 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3406 for (; idx < 0; idx++)
3408 int elt = charbuf[idx];
3410 if (elt == -2)
3412 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3413 idx++;
3415 else if (elt == -1)
3417 charbuf[idx++] = ISO_CODE_ESC;
3418 charbuf[idx] = '0';
3419 new_chars += 2;
3422 cmp_status->state = COMPOSING_NO;
3423 return new_chars;
3426 /* If characters are under composition, finish the composition. */
3427 #define MAYBE_FINISH_COMPOSITION() \
3428 do { \
3429 if (cmp_status->state != COMPOSING_NO) \
3430 char_offset += finish_composition (charbuf, cmp_status); \
3431 } while (0)
3433 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3435 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3436 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3437 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3438 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3440 Produce this annotation sequence now:
3442 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3445 #define DECODE_COMPOSITION_START(c1) \
3446 do { \
3447 if (c1 == '0' \
3448 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3449 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3450 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3451 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3453 *charbuf++ = -1; \
3454 *charbuf++= -1; \
3455 cmp_status->state = COMPOSING_CHAR; \
3456 cmp_status->length += 2; \
3458 else \
3460 MAYBE_FINISH_COMPOSITION (); \
3461 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3462 : c1 == '2' ? COMPOSITION_WITH_RULE \
3463 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3464 : COMPOSITION_WITH_RULE_ALTCHARS); \
3465 cmp_status->state \
3466 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3467 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3468 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3469 cmp_status->nchars = cmp_status->ncomps = 0; \
3470 coding->annotated = 1; \
3472 } while (0)
3475 /* Handle composition end sequence ESC 1. */
3477 #define DECODE_COMPOSITION_END() \
3478 do { \
3479 if (cmp_status->nchars == 0 \
3480 || ((cmp_status->state == COMPOSING_CHAR) \
3481 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3483 MAYBE_FINISH_COMPOSITION (); \
3484 goto invalid_code; \
3486 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3487 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3488 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3489 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3490 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3491 char_offset += cmp_status->nchars; \
3492 cmp_status->state = COMPOSING_NO; \
3493 } while (0)
3495 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3497 #define STORE_COMPOSITION_RULE(rule) \
3498 do { \
3499 *charbuf++ = -2; \
3500 *charbuf++ = rule; \
3501 cmp_status->length += 2; \
3502 cmp_status->state--; \
3503 } while (0)
3505 /* Store a composed char or a component char C in charbuf, and update
3506 cmp_status. */
3508 #define STORE_COMPOSITION_CHAR(c) \
3509 do { \
3510 *charbuf++ = (c); \
3511 cmp_status->length++; \
3512 if (cmp_status->state == COMPOSING_CHAR) \
3513 cmp_status->nchars++; \
3514 else \
3515 cmp_status->ncomps++; \
3516 if (cmp_status->method == COMPOSITION_WITH_RULE \
3517 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3518 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3519 cmp_status->state++; \
3520 } while (0)
3523 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3525 static void
3526 decode_coding_iso_2022 (struct coding_system *coding)
3528 const unsigned char *src = coding->source + coding->consumed;
3529 const unsigned char *src_end = coding->source + coding->src_bytes;
3530 const unsigned char *src_base;
3531 int *charbuf = coding->charbuf + coding->charbuf_used;
3532 /* We may produce two annotations (charset and composition) in one
3533 loop and one more charset annotation at the end. */
3534 int *charbuf_end
3535 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3536 ptrdiff_t consumed_chars = 0, consumed_chars_base;
3537 bool multibytep = coding->src_multibyte;
3538 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3539 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3540 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3541 int charset_id_2, charset_id_3;
3542 struct charset *charset;
3543 int c;
3544 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3545 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3546 ptrdiff_t char_offset = coding->produced_char;
3547 ptrdiff_t last_offset = char_offset;
3548 int last_id = charset_ascii;
3549 bool eol_dos
3550 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3551 int byte_after_cr = -1;
3552 int i;
3554 setup_iso_safe_charsets (attrs);
3555 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3557 if (cmp_status->state != COMPOSING_NO)
3559 if (charbuf_end - charbuf < cmp_status->length)
3560 emacs_abort ();
3561 for (i = 0; i < cmp_status->length; i++)
3562 *charbuf++ = cmp_status->carryover[i];
3563 coding->annotated = 1;
3566 while (1)
3568 int c1, c2, c3;
3570 src_base = src;
3571 consumed_chars_base = consumed_chars;
3573 if (charbuf >= charbuf_end)
3575 if (byte_after_cr >= 0)
3576 src_base--;
3577 break;
3580 if (byte_after_cr >= 0)
3581 c1 = byte_after_cr, byte_after_cr = -1;
3582 else
3583 ONE_MORE_BYTE (c1);
3584 if (c1 < 0)
3585 goto invalid_code;
3587 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3589 *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3590 char_offset++;
3591 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3592 continue;
3595 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3597 if (c1 == ISO_CODE_ESC)
3599 if (src + 1 >= src_end)
3600 goto no_more_source;
3601 *charbuf++ = ISO_CODE_ESC;
3602 char_offset++;
3603 if (src[0] == '%' && src[1] == '@')
3605 src += 2;
3606 consumed_chars += 2;
3607 char_offset += 2;
3608 /* We are sure charbuf can contain two more chars. */
3609 *charbuf++ = '%';
3610 *charbuf++ = '@';
3611 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3614 else
3616 *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3617 char_offset++;
3619 continue;
3622 if ((cmp_status->state == COMPOSING_RULE
3623 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3624 && c1 != ISO_CODE_ESC)
3626 int rule;
3628 DECODE_COMPOSITION_RULE (rule);
3629 STORE_COMPOSITION_RULE (rule);
3630 continue;
3633 /* We produce at most one character. */
3634 switch (iso_code_class [c1])
3636 case ISO_0x20_or_0x7F:
3637 if (charset_id_0 < 0
3638 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3639 /* This is SPACE or DEL. */
3640 charset = CHARSET_FROM_ID (charset_ascii);
3641 else
3642 charset = CHARSET_FROM_ID (charset_id_0);
3643 break;
3645 case ISO_graphic_plane_0:
3646 if (charset_id_0 < 0)
3647 charset = CHARSET_FROM_ID (charset_ascii);
3648 else
3649 charset = CHARSET_FROM_ID (charset_id_0);
3650 break;
3652 case ISO_0xA0_or_0xFF:
3653 if (charset_id_1 < 0
3654 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3655 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3656 goto invalid_code;
3657 /* This is a graphic character, we fall down ... */
3659 case ISO_graphic_plane_1:
3660 if (charset_id_1 < 0)
3661 goto invalid_code;
3662 charset = CHARSET_FROM_ID (charset_id_1);
3663 break;
3665 case ISO_control_0:
3666 if (eol_dos && c1 == '\r')
3667 ONE_MORE_BYTE (byte_after_cr);
3668 MAYBE_FINISH_COMPOSITION ();
3669 charset = CHARSET_FROM_ID (charset_ascii);
3670 break;
3672 case ISO_control_1:
3673 goto invalid_code;
3675 case ISO_shift_out:
3676 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3677 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3678 goto invalid_code;
3679 CODING_ISO_INVOCATION (coding, 0) = 1;
3680 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3681 continue;
3683 case ISO_shift_in:
3684 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3685 goto invalid_code;
3686 CODING_ISO_INVOCATION (coding, 0) = 0;
3687 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3688 continue;
3690 case ISO_single_shift_2_7:
3691 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3692 goto invalid_code;
3693 case ISO_single_shift_2:
3694 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3695 goto invalid_code;
3696 /* SS2 is handled as an escape sequence of ESC 'N' */
3697 c1 = 'N';
3698 goto label_escape_sequence;
3700 case ISO_single_shift_3:
3701 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3702 goto invalid_code;
3703 /* SS2 is handled as an escape sequence of ESC 'O' */
3704 c1 = 'O';
3705 goto label_escape_sequence;
3707 case ISO_control_sequence_introducer:
3708 /* CSI is handled as an escape sequence of ESC '[' ... */
3709 c1 = '[';
3710 goto label_escape_sequence;
3712 case ISO_escape:
3713 ONE_MORE_BYTE (c1);
3714 label_escape_sequence:
3715 /* Escape sequences handled here are invocation,
3716 designation, direction specification, and character
3717 composition specification. */
3718 switch (c1)
3720 case '&': /* revision of following character set */
3721 ONE_MORE_BYTE (c1);
3722 if (!(c1 >= '@' && c1 <= '~'))
3723 goto invalid_code;
3724 ONE_MORE_BYTE (c1);
3725 if (c1 != ISO_CODE_ESC)
3726 goto invalid_code;
3727 ONE_MORE_BYTE (c1);
3728 goto label_escape_sequence;
3730 case '$': /* designation of 2-byte character set */
3731 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3732 goto invalid_code;
3734 int reg, chars96;
3736 ONE_MORE_BYTE (c1);
3737 if (c1 >= '@' && c1 <= 'B')
3738 { /* designation of JISX0208.1978, GB2312.1980,
3739 or JISX0208.1980 */
3740 reg = 0, chars96 = 0;
3742 else if (c1 >= 0x28 && c1 <= 0x2B)
3743 { /* designation of DIMENSION2_CHARS94 character set */
3744 reg = c1 - 0x28, chars96 = 0;
3745 ONE_MORE_BYTE (c1);
3747 else if (c1 >= 0x2C && c1 <= 0x2F)
3748 { /* designation of DIMENSION2_CHARS96 character set */
3749 reg = c1 - 0x2C, chars96 = 1;
3750 ONE_MORE_BYTE (c1);
3752 else
3753 goto invalid_code;
3754 DECODE_DESIGNATION (reg, 2, chars96, c1);
3755 /* We must update these variables now. */
3756 if (reg == 0)
3757 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3758 else if (reg == 1)
3759 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3760 if (chars96 < 0)
3761 goto invalid_code;
3763 continue;
3765 case 'n': /* invocation of locking-shift-2 */
3766 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3767 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3768 goto invalid_code;
3769 CODING_ISO_INVOCATION (coding, 0) = 2;
3770 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3771 continue;
3773 case 'o': /* invocation of locking-shift-3 */
3774 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3775 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3776 goto invalid_code;
3777 CODING_ISO_INVOCATION (coding, 0) = 3;
3778 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3779 continue;
3781 case 'N': /* invocation of single-shift-2 */
3782 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3783 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3784 goto invalid_code;
3785 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3786 if (charset_id_2 < 0)
3787 charset = CHARSET_FROM_ID (charset_ascii);
3788 else
3789 charset = CHARSET_FROM_ID (charset_id_2);
3790 ONE_MORE_BYTE (c1);
3791 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3792 || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3793 && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3794 ? c1 >= 0x80 : c1 < 0x80)))
3795 goto invalid_code;
3796 break;
3798 case 'O': /* invocation of single-shift-3 */
3799 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3800 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3801 goto invalid_code;
3802 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3803 if (charset_id_3 < 0)
3804 charset = CHARSET_FROM_ID (charset_ascii);
3805 else
3806 charset = CHARSET_FROM_ID (charset_id_3);
3807 ONE_MORE_BYTE (c1);
3808 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3809 || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3810 && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3811 ? c1 >= 0x80 : c1 < 0x80)))
3812 goto invalid_code;
3813 break;
3815 case '0': case '2': case '3': case '4': /* start composition */
3816 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3817 goto invalid_code;
3818 if (last_id != charset_ascii)
3820 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3821 last_id = charset_ascii;
3822 last_offset = char_offset;
3824 DECODE_COMPOSITION_START (c1);
3825 continue;
3827 case '1': /* end composition */
3828 if (cmp_status->state == COMPOSING_NO)
3829 goto invalid_code;
3830 DECODE_COMPOSITION_END ();
3831 continue;
3833 case '[': /* specification of direction */
3834 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3835 goto invalid_code;
3836 /* For the moment, nested direction is not supported.
3837 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3838 left-to-right, and nonzero means right-to-left. */
3839 ONE_MORE_BYTE (c1);
3840 switch (c1)
3842 case ']': /* end of the current direction */
3843 coding->mode &= ~CODING_MODE_DIRECTION;
3845 case '0': /* end of the current direction */
3846 case '1': /* start of left-to-right direction */
3847 ONE_MORE_BYTE (c1);
3848 if (c1 == ']')
3849 coding->mode &= ~CODING_MODE_DIRECTION;
3850 else
3851 goto invalid_code;
3852 break;
3854 case '2': /* start of right-to-left direction */
3855 ONE_MORE_BYTE (c1);
3856 if (c1 == ']')
3857 coding->mode |= CODING_MODE_DIRECTION;
3858 else
3859 goto invalid_code;
3860 break;
3862 default:
3863 goto invalid_code;
3865 continue;
3867 case '%':
3868 ONE_MORE_BYTE (c1);
3869 if (c1 == '/')
3871 /* CTEXT extended segment:
3872 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3873 We keep these bytes as is for the moment.
3874 They may be decoded by post-read-conversion. */
3875 int dim, M, L;
3876 int size;
3878 ONE_MORE_BYTE (dim);
3879 if (dim < '0' || dim > '4')
3880 goto invalid_code;
3881 ONE_MORE_BYTE (M);
3882 if (M < 128)
3883 goto invalid_code;
3884 ONE_MORE_BYTE (L);
3885 if (L < 128)
3886 goto invalid_code;
3887 size = ((M - 128) * 128) + (L - 128);
3888 if (charbuf + 6 > charbuf_end)
3889 goto break_loop;
3890 *charbuf++ = ISO_CODE_ESC;
3891 *charbuf++ = '%';
3892 *charbuf++ = '/';
3893 *charbuf++ = dim;
3894 *charbuf++ = BYTE8_TO_CHAR (M);
3895 *charbuf++ = BYTE8_TO_CHAR (L);
3896 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3898 else if (c1 == 'G')
3900 /* XFree86 extension for embedding UTF-8 in CTEXT:
3901 ESC % G --UTF-8-BYTES-- ESC % @
3902 We keep these bytes as is for the moment.
3903 They may be decoded by post-read-conversion. */
3904 if (charbuf + 3 > charbuf_end)
3905 goto break_loop;
3906 *charbuf++ = ISO_CODE_ESC;
3907 *charbuf++ = '%';
3908 *charbuf++ = 'G';
3909 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3911 else
3912 goto invalid_code;
3913 continue;
3914 break;
3916 default:
3917 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3918 goto invalid_code;
3920 int reg, chars96;
3922 if (c1 >= 0x28 && c1 <= 0x2B)
3923 { /* designation of DIMENSION1_CHARS94 character set */
3924 reg = c1 - 0x28, chars96 = 0;
3925 ONE_MORE_BYTE (c1);
3927 else if (c1 >= 0x2C && c1 <= 0x2F)
3928 { /* designation of DIMENSION1_CHARS96 character set */
3929 reg = c1 - 0x2C, chars96 = 1;
3930 ONE_MORE_BYTE (c1);
3932 else
3933 goto invalid_code;
3934 DECODE_DESIGNATION (reg, 1, chars96, c1);
3935 /* We must update these variables now. */
3936 if (reg == 0)
3937 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3938 else if (reg == 1)
3939 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3940 if (chars96 < 0)
3941 goto invalid_code;
3943 continue;
3945 break;
3947 default:
3948 emacs_abort ();
3951 if (cmp_status->state == COMPOSING_NO
3952 && charset->id != charset_ascii
3953 && last_id != charset->id)
3955 if (last_id != charset_ascii)
3956 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3957 last_id = charset->id;
3958 last_offset = char_offset;
3961 /* Now we know CHARSET and 1st position code C1 of a character.
3962 Produce a decoded character while getting 2nd and 3rd
3963 position codes C2, C3 if necessary. */
3964 if (CHARSET_DIMENSION (charset) > 1)
3966 ONE_MORE_BYTE (c2);
3967 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3968 || ((c1 & 0x80) != (c2 & 0x80)))
3969 /* C2 is not in a valid range. */
3970 goto invalid_code;
3971 if (CHARSET_DIMENSION (charset) == 2)
3972 c1 = (c1 << 8) | c2;
3973 else
3975 ONE_MORE_BYTE (c3);
3976 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3977 || ((c1 & 0x80) != (c3 & 0x80)))
3978 /* C3 is not in a valid range. */
3979 goto invalid_code;
3980 c1 = (c1 << 16) | (c2 << 8) | c2;
3983 c1 &= 0x7F7F7F;
3984 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3985 if (c < 0)
3987 MAYBE_FINISH_COMPOSITION ();
3988 for (; src_base < src; src_base++, char_offset++)
3990 if (ASCII_CHAR_P (*src_base))
3991 *charbuf++ = *src_base;
3992 else
3993 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3996 else if (cmp_status->state == COMPOSING_NO)
3998 *charbuf++ = c;
3999 char_offset++;
4001 else if ((cmp_status->state == COMPOSING_CHAR
4002 ? cmp_status->nchars
4003 : cmp_status->ncomps)
4004 >= MAX_COMPOSITION_COMPONENTS)
4006 /* Too long composition. */
4007 MAYBE_FINISH_COMPOSITION ();
4008 *charbuf++ = c;
4009 char_offset++;
4011 else
4012 STORE_COMPOSITION_CHAR (c);
4013 continue;
4015 invalid_code:
4016 MAYBE_FINISH_COMPOSITION ();
4017 src = src_base;
4018 consumed_chars = consumed_chars_base;
4019 ONE_MORE_BYTE (c);
4020 *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
4021 char_offset++;
4022 /* Reset the invocation and designation status to the safest
4023 one; i.e. designate ASCII to the graphic register 0, and
4024 invoke that register to the graphic plane 0. This typically
4025 helps the case that an designation sequence for ASCII "ESC (
4026 B" is somehow broken (e.g. broken by a newline). */
4027 CODING_ISO_INVOCATION (coding, 0) = 0;
4028 CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
4029 charset_id_0 = charset_ascii;
4030 continue;
4032 break_loop:
4033 break;
4036 no_more_source:
4037 if (cmp_status->state != COMPOSING_NO)
4039 if (coding->mode & CODING_MODE_LAST_BLOCK)
4040 MAYBE_FINISH_COMPOSITION ();
4041 else
4043 charbuf -= cmp_status->length;
4044 for (i = 0; i < cmp_status->length; i++)
4045 cmp_status->carryover[i] = charbuf[i];
4048 else if (last_id != charset_ascii)
4049 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4050 coding->consumed_char += consumed_chars_base;
4051 coding->consumed = src_base - coding->source;
4052 coding->charbuf_used = charbuf - coding->charbuf;
4056 /* ISO2022 encoding stuff. */
4059 It is not enough to say just "ISO2022" on encoding, we have to
4060 specify more details. In Emacs, each coding system of ISO2022
4061 variant has the following specifications:
4062 1. Initial designation to G0 thru G3.
4063 2. Allows short-form designation?
4064 3. ASCII should be designated to G0 before control characters?
4065 4. ASCII should be designated to G0 at end of line?
4066 5. 7-bit environment or 8-bit environment?
4067 6. Use locking-shift?
4068 7. Use Single-shift?
4069 And the following two are only for Japanese:
4070 8. Use ASCII in place of JIS0201-1976-Roman?
4071 9. Use JISX0208-1983 in place of JISX0208-1978?
4072 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4073 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
4074 details.
4077 /* Produce codes (escape sequence) for designating CHARSET to graphic
4078 register REG at DST, and increment DST. If <final-char> of CHARSET is
4079 '@', 'A', or 'B' and the coding system CODING allows, produce
4080 designation sequence of short-form. */
4082 #define ENCODE_DESIGNATION(charset, reg, coding) \
4083 do { \
4084 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4085 const char *intermediate_char_94 = "()*+"; \
4086 const char *intermediate_char_96 = ",-./"; \
4087 int revision = -1; \
4089 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
4090 revision = CHARSET_ISO_REVISION (charset); \
4092 if (revision >= 0) \
4094 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
4095 EMIT_ONE_BYTE ('@' + revision); \
4097 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4098 if (CHARSET_DIMENSION (charset) == 1) \
4100 int b; \
4101 if (! CHARSET_ISO_CHARS_96 (charset)) \
4102 b = intermediate_char_94[reg]; \
4103 else \
4104 b = intermediate_char_96[reg]; \
4105 EMIT_ONE_ASCII_BYTE (b); \
4107 else \
4109 EMIT_ONE_ASCII_BYTE ('$'); \
4110 if (! CHARSET_ISO_CHARS_96 (charset)) \
4112 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
4113 || reg != 0 \
4114 || final_char < '@' || final_char > 'B') \
4115 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4117 else \
4118 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4120 EMIT_ONE_ASCII_BYTE (final_char); \
4122 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4123 } while (0)
4126 /* The following two macros produce codes (control character or escape
4127 sequence) for ISO2022 single-shift functions (single-shift-2 and
4128 single-shift-3). */
4130 #define ENCODE_SINGLE_SHIFT_2 \
4131 do { \
4132 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4133 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4134 else \
4135 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4136 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4137 } while (0)
4140 #define ENCODE_SINGLE_SHIFT_3 \
4141 do { \
4142 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4143 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4144 else \
4145 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4146 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4147 } while (0)
4150 /* The following four macros produce codes (control character or
4151 escape sequence) for ISO2022 locking-shift functions (shift-in,
4152 shift-out, locking-shift-2, and locking-shift-3). */
4154 #define ENCODE_SHIFT_IN \
4155 do { \
4156 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4157 CODING_ISO_INVOCATION (coding, 0) = 0; \
4158 } while (0)
4161 #define ENCODE_SHIFT_OUT \
4162 do { \
4163 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4164 CODING_ISO_INVOCATION (coding, 0) = 1; \
4165 } while (0)
4168 #define ENCODE_LOCKING_SHIFT_2 \
4169 do { \
4170 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4171 CODING_ISO_INVOCATION (coding, 0) = 2; \
4172 } while (0)
4175 #define ENCODE_LOCKING_SHIFT_3 \
4176 do { \
4177 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4178 CODING_ISO_INVOCATION (coding, 0) = 3; \
4179 } while (0)
4182 /* Produce codes for a DIMENSION1 character whose character set is
4183 CHARSET and whose position-code is C1. Designation and invocation
4184 sequences are also produced in advance if necessary. */
4186 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4187 do { \
4188 int id = CHARSET_ID (charset); \
4190 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4191 && id == charset_ascii) \
4193 id = charset_jisx0201_roman; \
4194 charset = CHARSET_FROM_ID (id); \
4197 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4199 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4200 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4201 else \
4202 EMIT_ONE_BYTE (c1 | 0x80); \
4203 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4204 break; \
4206 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4208 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4209 break; \
4211 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4213 EMIT_ONE_BYTE (c1 | 0x80); \
4214 break; \
4216 else \
4217 /* Since CHARSET is not yet invoked to any graphic planes, we \
4218 must invoke it, or, at first, designate it to some graphic \
4219 register. Then repeat the loop to actually produce the \
4220 character. */ \
4221 dst = encode_invocation_designation (charset, coding, dst, \
4222 &produced_chars); \
4223 } while (1)
4226 /* Produce codes for a DIMENSION2 character whose character set is
4227 CHARSET and whose position-codes are C1 and C2. Designation and
4228 invocation codes are also produced in advance if necessary. */
4230 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4231 do { \
4232 int id = CHARSET_ID (charset); \
4234 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4235 && id == charset_jisx0208) \
4237 id = charset_jisx0208_1978; \
4238 charset = CHARSET_FROM_ID (id); \
4241 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4243 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4244 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4245 else \
4246 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4247 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4248 break; \
4250 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4252 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4253 break; \
4255 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4257 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4258 break; \
4260 else \
4261 /* Since CHARSET is not yet invoked to any graphic planes, we \
4262 must invoke it, or, at first, designate it to some graphic \
4263 register. Then repeat the loop to actually produce the \
4264 character. */ \
4265 dst = encode_invocation_designation (charset, coding, dst, \
4266 &produced_chars); \
4267 } while (1)
4270 #define ENCODE_ISO_CHARACTER(charset, c) \
4271 do { \
4272 unsigned code; \
4273 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4275 if (CHARSET_DIMENSION (charset) == 1) \
4276 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4277 else \
4278 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4279 } while (0)
4282 /* Produce designation and invocation codes at a place pointed by DST
4283 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4284 Return new DST. */
4286 static unsigned char *
4287 encode_invocation_designation (struct charset *charset,
4288 struct coding_system *coding,
4289 unsigned char *dst, ptrdiff_t *p_nchars)
4291 bool multibytep = coding->dst_multibyte;
4292 ptrdiff_t produced_chars = *p_nchars;
4293 int reg; /* graphic register number */
4294 int id = CHARSET_ID (charset);
4296 /* At first, check designations. */
4297 for (reg = 0; reg < 4; reg++)
4298 if (id == CODING_ISO_DESIGNATION (coding, reg))
4299 break;
4301 if (reg >= 4)
4303 /* CHARSET is not yet designated to any graphic registers. */
4304 /* At first check the requested designation. */
4305 reg = CODING_ISO_REQUEST (coding, id);
4306 if (reg < 0)
4307 /* Since CHARSET requests no special designation, designate it
4308 to graphic register 0. */
4309 reg = 0;
4311 ENCODE_DESIGNATION (charset, reg, coding);
4314 if (CODING_ISO_INVOCATION (coding, 0) != reg
4315 && CODING_ISO_INVOCATION (coding, 1) != reg)
4317 /* Since the graphic register REG is not invoked to any graphic
4318 planes, invoke it to graphic plane 0. */
4319 switch (reg)
4321 case 0: /* graphic register 0 */
4322 ENCODE_SHIFT_IN;
4323 break;
4325 case 1: /* graphic register 1 */
4326 ENCODE_SHIFT_OUT;
4327 break;
4329 case 2: /* graphic register 2 */
4330 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4331 ENCODE_SINGLE_SHIFT_2;
4332 else
4333 ENCODE_LOCKING_SHIFT_2;
4334 break;
4336 case 3: /* graphic register 3 */
4337 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4338 ENCODE_SINGLE_SHIFT_3;
4339 else
4340 ENCODE_LOCKING_SHIFT_3;
4341 break;
4345 *p_nchars = produced_chars;
4346 return dst;
4350 /* Produce codes for designation and invocation to reset the graphic
4351 planes and registers to initial state. */
4352 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4353 do { \
4354 int reg; \
4355 struct charset *charset; \
4357 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4358 ENCODE_SHIFT_IN; \
4359 for (reg = 0; reg < 4; reg++) \
4360 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4361 && (CODING_ISO_DESIGNATION (coding, reg) \
4362 != CODING_ISO_INITIAL (coding, reg))) \
4364 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4365 ENCODE_DESIGNATION (charset, reg, coding); \
4367 } while (0)
4370 /* Produce designation sequences of charsets in the line started from
4371 CHARBUF to a place pointed by DST, and return the number of
4372 produced bytes. DST should not directly point a buffer text area
4373 which may be relocated by char_charset call.
4375 If the current block ends before any end-of-line, we may fail to
4376 find all the necessary designations. */
4378 static ptrdiff_t
4379 encode_designation_at_bol (struct coding_system *coding,
4380 int *charbuf, int *charbuf_end,
4381 unsigned char *dst)
4383 unsigned char *orig = dst;
4384 struct charset *charset;
4385 /* Table of charsets to be designated to each graphic register. */
4386 int r[4];
4387 int c, found = 0, reg;
4388 ptrdiff_t produced_chars = 0;
4389 bool multibytep = coding->dst_multibyte;
4390 Lisp_Object attrs;
4391 Lisp_Object charset_list;
4393 attrs = CODING_ID_ATTRS (coding->id);
4394 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4395 if (EQ (charset_list, Qiso_2022))
4396 charset_list = Viso_2022_charset_list;
4398 for (reg = 0; reg < 4; reg++)
4399 r[reg] = -1;
4401 while (charbuf < charbuf_end && found < 4)
4403 int id;
4405 c = *charbuf++;
4406 if (c == '\n')
4407 break;
4408 charset = char_charset (c, charset_list, NULL);
4409 id = CHARSET_ID (charset);
4410 reg = CODING_ISO_REQUEST (coding, id);
4411 if (reg >= 0 && r[reg] < 0)
4413 found++;
4414 r[reg] = id;
4418 if (found)
4420 for (reg = 0; reg < 4; reg++)
4421 if (r[reg] >= 0
4422 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4423 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4426 return dst - orig;
4429 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4431 static bool
4432 encode_coding_iso_2022 (struct coding_system *coding)
4434 bool multibytep = coding->dst_multibyte;
4435 int *charbuf = coding->charbuf;
4436 int *charbuf_end = charbuf + coding->charbuf_used;
4437 unsigned char *dst = coding->destination + coding->produced;
4438 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4439 int safe_room = 16;
4440 bool bol_designation
4441 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4442 && CODING_ISO_BOL (coding));
4443 ptrdiff_t produced_chars = 0;
4444 Lisp_Object attrs, eol_type, charset_list;
4445 bool ascii_compatible;
4446 int c;
4447 int preferred_charset_id = -1;
4449 CODING_GET_INFO (coding, attrs, charset_list);
4450 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4451 if (VECTORP (eol_type))
4452 eol_type = Qunix;
4454 setup_iso_safe_charsets (attrs);
4455 /* Charset list may have been changed. */
4456 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4457 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4459 ascii_compatible
4460 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4461 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4462 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4464 while (charbuf < charbuf_end)
4466 ASSURE_DESTINATION (safe_room);
4468 if (bol_designation)
4470 /* We have to produce designation sequences if any now. */
4471 unsigned char desig_buf[16];
4472 ptrdiff_t nbytes;
4473 ptrdiff_t offset;
4475 charset_map_loaded = 0;
4476 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4477 desig_buf);
4478 if (charset_map_loaded
4479 && (offset = coding_change_destination (coding)))
4481 dst += offset;
4482 dst_end += offset;
4484 memcpy (dst, desig_buf, nbytes);
4485 dst += nbytes;
4486 /* We are sure that designation sequences are all ASCII bytes. */
4487 produced_chars += nbytes;
4488 bol_designation = 0;
4489 ASSURE_DESTINATION (safe_room);
4492 c = *charbuf++;
4494 if (c < 0)
4496 /* Handle an annotation. */
4497 switch (*charbuf)
4499 case CODING_ANNOTATE_COMPOSITION_MASK:
4500 /* Not yet implemented. */
4501 break;
4502 case CODING_ANNOTATE_CHARSET_MASK:
4503 preferred_charset_id = charbuf[2];
4504 if (preferred_charset_id >= 0
4505 && NILP (Fmemq (make_number (preferred_charset_id),
4506 charset_list)))
4507 preferred_charset_id = -1;
4508 break;
4509 default:
4510 emacs_abort ();
4512 charbuf += -c - 1;
4513 continue;
4516 /* Now encode the character C. */
4517 if (c < 0x20 || c == 0x7F)
4519 if (c == '\n'
4520 || (c == '\r' && EQ (eol_type, Qmac)))
4522 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4523 ENCODE_RESET_PLANE_AND_REGISTER ();
4524 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4526 int i;
4528 for (i = 0; i < 4; i++)
4529 CODING_ISO_DESIGNATION (coding, i)
4530 = CODING_ISO_INITIAL (coding, i);
4532 bol_designation = ((CODING_ISO_FLAGS (coding)
4533 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4534 != 0);
4536 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4537 ENCODE_RESET_PLANE_AND_REGISTER ();
4538 EMIT_ONE_ASCII_BYTE (c);
4540 else if (ASCII_CHAR_P (c))
4542 if (ascii_compatible)
4543 EMIT_ONE_ASCII_BYTE (c);
4544 else
4546 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4547 ENCODE_ISO_CHARACTER (charset, c);
4550 else if (CHAR_BYTE8_P (c))
4552 c = CHAR_TO_BYTE8 (c);
4553 EMIT_ONE_BYTE (c);
4555 else
4557 struct charset *charset;
4559 if (preferred_charset_id >= 0)
4561 bool result;
4563 charset = CHARSET_FROM_ID (preferred_charset_id);
4564 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4565 if (! result)
4566 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4567 NULL, charset);
4569 else
4570 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4571 NULL, charset);
4572 if (!charset)
4574 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4576 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4577 charset = CHARSET_FROM_ID (charset_ascii);
4579 else
4581 c = coding->default_char;
4582 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4583 charset_list, NULL, charset);
4586 ENCODE_ISO_CHARACTER (charset, c);
4590 if (coding->mode & CODING_MODE_LAST_BLOCK
4591 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4593 ASSURE_DESTINATION (safe_room);
4594 ENCODE_RESET_PLANE_AND_REGISTER ();
4596 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4597 CODING_ISO_BOL (coding) = bol_designation;
4598 coding->produced_char += produced_chars;
4599 coding->produced = dst - coding->destination;
4600 return 0;
4604 /*** 8,9. SJIS and BIG5 handlers ***/
4606 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4607 quite widely. So, for the moment, Emacs supports them in the bare
4608 C code. But, in the future, they may be supported only by CCL. */
4610 /* SJIS is a coding system encoding three character sets: ASCII, right
4611 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4612 as is. A character of charset katakana-jisx0201 is encoded by
4613 "position-code + 0x80". A character of charset japanese-jisx0208
4614 is encoded in 2-byte but two position-codes are divided and shifted
4615 so that it fit in the range below.
4617 --- CODE RANGE of SJIS ---
4618 (character set) (range)
4619 ASCII 0x00 .. 0x7F
4620 KATAKANA-JISX0201 0xA0 .. 0xDF
4621 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4622 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4623 -------------------------------
4627 /* BIG5 is a coding system encoding two character sets: ASCII and
4628 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4629 character set and is encoded in two-byte.
4631 --- CODE RANGE of BIG5 ---
4632 (character set) (range)
4633 ASCII 0x00 .. 0x7F
4634 Big5 (1st byte) 0xA1 .. 0xFE
4635 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4636 --------------------------
4640 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4641 Return true if a text is encoded in SJIS. */
4643 static bool
4644 detect_coding_sjis (struct coding_system *coding,
4645 struct coding_detection_info *detect_info)
4647 const unsigned char *src = coding->source, *src_base;
4648 const unsigned char *src_end = coding->source + coding->src_bytes;
4649 bool multibytep = coding->src_multibyte;
4650 ptrdiff_t consumed_chars = 0;
4651 int found = 0;
4652 int c;
4653 Lisp_Object attrs, charset_list;
4654 int max_first_byte_of_2_byte_code;
4656 CODING_GET_INFO (coding, attrs, charset_list);
4657 max_first_byte_of_2_byte_code
4658 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4660 detect_info->checked |= CATEGORY_MASK_SJIS;
4661 /* A coding system of this category is always ASCII compatible. */
4662 src += coding->head_ascii;
4664 while (1)
4666 src_base = src;
4667 ONE_MORE_BYTE (c);
4668 if (c < 0x80)
4669 continue;
4670 if ((c >= 0x81 && c <= 0x9F)
4671 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4673 ONE_MORE_BYTE (c);
4674 if (c < 0x40 || c == 0x7F || c > 0xFC)
4675 break;
4676 found = CATEGORY_MASK_SJIS;
4678 else if (c >= 0xA0 && c < 0xE0)
4679 found = CATEGORY_MASK_SJIS;
4680 else
4681 break;
4683 detect_info->rejected |= CATEGORY_MASK_SJIS;
4684 return 0;
4686 no_more_source:
4687 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4689 detect_info->rejected |= CATEGORY_MASK_SJIS;
4690 return 0;
4692 detect_info->found |= found;
4693 return 1;
4696 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4697 Return true if a text is encoded in BIG5. */
4699 static bool
4700 detect_coding_big5 (struct coding_system *coding,
4701 struct coding_detection_info *detect_info)
4703 const unsigned char *src = coding->source, *src_base;
4704 const unsigned char *src_end = coding->source + coding->src_bytes;
4705 bool multibytep = coding->src_multibyte;
4706 ptrdiff_t consumed_chars = 0;
4707 int found = 0;
4708 int c;
4710 detect_info->checked |= CATEGORY_MASK_BIG5;
4711 /* A coding system of this category is always ASCII compatible. */
4712 src += coding->head_ascii;
4714 while (1)
4716 src_base = src;
4717 ONE_MORE_BYTE (c);
4718 if (c < 0x80)
4719 continue;
4720 if (c >= 0xA1)
4722 ONE_MORE_BYTE (c);
4723 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4724 return 0;
4725 found = CATEGORY_MASK_BIG5;
4727 else
4728 break;
4730 detect_info->rejected |= CATEGORY_MASK_BIG5;
4731 return 0;
4733 no_more_source:
4734 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4736 detect_info->rejected |= CATEGORY_MASK_BIG5;
4737 return 0;
4739 detect_info->found |= found;
4740 return 1;
4743 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4745 static void
4746 decode_coding_sjis (struct coding_system *coding)
4748 const unsigned char *src = coding->source + coding->consumed;
4749 const unsigned char *src_end = coding->source + coding->src_bytes;
4750 const unsigned char *src_base;
4751 int *charbuf = coding->charbuf + coding->charbuf_used;
4752 /* We may produce one charset annotation in one loop and one more at
4753 the end. */
4754 int *charbuf_end
4755 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4756 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4757 bool multibytep = coding->src_multibyte;
4758 struct charset *charset_roman, *charset_kanji, *charset_kana;
4759 struct charset *charset_kanji2;
4760 Lisp_Object attrs, charset_list, val;
4761 ptrdiff_t char_offset = coding->produced_char;
4762 ptrdiff_t last_offset = char_offset;
4763 int last_id = charset_ascii;
4764 bool eol_dos
4765 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4766 int byte_after_cr = -1;
4768 CODING_GET_INFO (coding, attrs, charset_list);
4770 val = charset_list;
4771 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4772 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4773 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4774 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4776 while (1)
4778 int c, c1;
4779 struct charset *charset;
4781 src_base = src;
4782 consumed_chars_base = consumed_chars;
4784 if (charbuf >= charbuf_end)
4786 if (byte_after_cr >= 0)
4787 src_base--;
4788 break;
4791 if (byte_after_cr >= 0)
4792 c = byte_after_cr, byte_after_cr = -1;
4793 else
4794 ONE_MORE_BYTE (c);
4795 if (c < 0)
4796 goto invalid_code;
4797 if (c < 0x80)
4799 if (eol_dos && c == '\r')
4800 ONE_MORE_BYTE (byte_after_cr);
4801 charset = charset_roman;
4803 else if (c == 0x80 || c == 0xA0)
4804 goto invalid_code;
4805 else if (c >= 0xA1 && c <= 0xDF)
4807 /* SJIS -> JISX0201-Kana */
4808 c &= 0x7F;
4809 charset = charset_kana;
4811 else if (c <= 0xEF)
4813 /* SJIS -> JISX0208 */
4814 ONE_MORE_BYTE (c1);
4815 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4816 goto invalid_code;
4817 c = (c << 8) | c1;
4818 SJIS_TO_JIS (c);
4819 charset = charset_kanji;
4821 else if (c <= 0xFC && charset_kanji2)
4823 /* SJIS -> JISX0213-2 */
4824 ONE_MORE_BYTE (c1);
4825 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4826 goto invalid_code;
4827 c = (c << 8) | c1;
4828 SJIS_TO_JIS2 (c);
4829 charset = charset_kanji2;
4831 else
4832 goto invalid_code;
4833 if (charset->id != charset_ascii
4834 && last_id != charset->id)
4836 if (last_id != charset_ascii)
4837 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4838 last_id = charset->id;
4839 last_offset = char_offset;
4841 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4842 *charbuf++ = c;
4843 char_offset++;
4844 continue;
4846 invalid_code:
4847 src = src_base;
4848 consumed_chars = consumed_chars_base;
4849 ONE_MORE_BYTE (c);
4850 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4851 char_offset++;
4854 no_more_source:
4855 if (last_id != charset_ascii)
4856 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4857 coding->consumed_char += consumed_chars_base;
4858 coding->consumed = src_base - coding->source;
4859 coding->charbuf_used = charbuf - coding->charbuf;
4862 static void
4863 decode_coding_big5 (struct coding_system *coding)
4865 const unsigned char *src = coding->source + coding->consumed;
4866 const unsigned char *src_end = coding->source + coding->src_bytes;
4867 const unsigned char *src_base;
4868 int *charbuf = coding->charbuf + coding->charbuf_used;
4869 /* We may produce one charset annotation in one loop and one more at
4870 the end. */
4871 int *charbuf_end
4872 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4873 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4874 bool multibytep = coding->src_multibyte;
4875 struct charset *charset_roman, *charset_big5;
4876 Lisp_Object attrs, charset_list, val;
4877 ptrdiff_t char_offset = coding->produced_char;
4878 ptrdiff_t last_offset = char_offset;
4879 int last_id = charset_ascii;
4880 bool eol_dos
4881 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4882 int byte_after_cr = -1;
4884 CODING_GET_INFO (coding, attrs, charset_list);
4885 val = charset_list;
4886 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4887 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4889 while (1)
4891 int c, c1;
4892 struct charset *charset;
4894 src_base = src;
4895 consumed_chars_base = consumed_chars;
4897 if (charbuf >= charbuf_end)
4899 if (byte_after_cr >= 0)
4900 src_base--;
4901 break;
4904 if (byte_after_cr >= 0)
4905 c = byte_after_cr, byte_after_cr = -1;
4906 else
4907 ONE_MORE_BYTE (c);
4909 if (c < 0)
4910 goto invalid_code;
4911 if (c < 0x80)
4913 if (eol_dos && c == '\r')
4914 ONE_MORE_BYTE (byte_after_cr);
4915 charset = charset_roman;
4917 else
4919 /* BIG5 -> Big5 */
4920 if (c < 0xA1 || c > 0xFE)
4921 goto invalid_code;
4922 ONE_MORE_BYTE (c1);
4923 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4924 goto invalid_code;
4925 c = c << 8 | c1;
4926 charset = charset_big5;
4928 if (charset->id != charset_ascii
4929 && last_id != charset->id)
4931 if (last_id != charset_ascii)
4932 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4933 last_id = charset->id;
4934 last_offset = char_offset;
4936 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4937 *charbuf++ = c;
4938 char_offset++;
4939 continue;
4941 invalid_code:
4942 src = src_base;
4943 consumed_chars = consumed_chars_base;
4944 ONE_MORE_BYTE (c);
4945 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4946 char_offset++;
4949 no_more_source:
4950 if (last_id != charset_ascii)
4951 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4952 coding->consumed_char += consumed_chars_base;
4953 coding->consumed = src_base - coding->source;
4954 coding->charbuf_used = charbuf - coding->charbuf;
4957 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4958 This function can encode charsets `ascii', `katakana-jisx0201',
4959 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4960 are sure that all these charsets are registered as official charset
4961 (i.e. do not have extended leading-codes). Characters of other
4962 charsets are produced without any encoding. */
4964 static bool
4965 encode_coding_sjis (struct coding_system *coding)
4967 bool multibytep = coding->dst_multibyte;
4968 int *charbuf = coding->charbuf;
4969 int *charbuf_end = charbuf + coding->charbuf_used;
4970 unsigned char *dst = coding->destination + coding->produced;
4971 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4972 int safe_room = 4;
4973 ptrdiff_t produced_chars = 0;
4974 Lisp_Object attrs, charset_list, val;
4975 bool ascii_compatible;
4976 struct charset *charset_kanji, *charset_kana;
4977 struct charset *charset_kanji2;
4978 int c;
4980 CODING_GET_INFO (coding, attrs, charset_list);
4981 val = XCDR (charset_list);
4982 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4983 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4984 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4986 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4988 while (charbuf < charbuf_end)
4990 ASSURE_DESTINATION (safe_room);
4991 c = *charbuf++;
4992 /* Now encode the character C. */
4993 if (ASCII_CHAR_P (c) && ascii_compatible)
4994 EMIT_ONE_ASCII_BYTE (c);
4995 else if (CHAR_BYTE8_P (c))
4997 c = CHAR_TO_BYTE8 (c);
4998 EMIT_ONE_BYTE (c);
5000 else
5002 unsigned code;
5003 struct charset *charset;
5004 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5005 &code, charset);
5007 if (!charset)
5009 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5011 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5012 charset = CHARSET_FROM_ID (charset_ascii);
5014 else
5016 c = coding->default_char;
5017 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5018 charset_list, &code, charset);
5021 if (code == CHARSET_INVALID_CODE (charset))
5022 emacs_abort ();
5023 if (charset == charset_kanji)
5025 int c1, c2;
5026 JIS_TO_SJIS (code);
5027 c1 = code >> 8, c2 = code & 0xFF;
5028 EMIT_TWO_BYTES (c1, c2);
5030 else if (charset == charset_kana)
5031 EMIT_ONE_BYTE (code | 0x80);
5032 else if (charset_kanji2 && charset == charset_kanji2)
5034 int c1, c2;
5036 c1 = code >> 8;
5037 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5038 || c1 == 0x28
5039 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5041 JIS_TO_SJIS2 (code);
5042 c1 = code >> 8, c2 = code & 0xFF;
5043 EMIT_TWO_BYTES (c1, c2);
5045 else
5046 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5048 else
5049 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5052 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5053 coding->produced_char += produced_chars;
5054 coding->produced = dst - coding->destination;
5055 return 0;
5058 static bool
5059 encode_coding_big5 (struct coding_system *coding)
5061 bool multibytep = coding->dst_multibyte;
5062 int *charbuf = coding->charbuf;
5063 int *charbuf_end = charbuf + coding->charbuf_used;
5064 unsigned char *dst = coding->destination + coding->produced;
5065 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5066 int safe_room = 4;
5067 ptrdiff_t produced_chars = 0;
5068 Lisp_Object attrs, charset_list, val;
5069 bool ascii_compatible;
5070 struct charset *charset_big5;
5071 int c;
5073 CODING_GET_INFO (coding, attrs, charset_list);
5074 val = XCDR (charset_list);
5075 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5076 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5078 while (charbuf < charbuf_end)
5080 ASSURE_DESTINATION (safe_room);
5081 c = *charbuf++;
5082 /* Now encode the character C. */
5083 if (ASCII_CHAR_P (c) && ascii_compatible)
5084 EMIT_ONE_ASCII_BYTE (c);
5085 else if (CHAR_BYTE8_P (c))
5087 c = CHAR_TO_BYTE8 (c);
5088 EMIT_ONE_BYTE (c);
5090 else
5092 unsigned code;
5093 struct charset *charset;
5094 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5095 &code, charset);
5097 if (! charset)
5099 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5101 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5102 charset = CHARSET_FROM_ID (charset_ascii);
5104 else
5106 c = coding->default_char;
5107 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5108 charset_list, &code, charset);
5111 if (code == CHARSET_INVALID_CODE (charset))
5112 emacs_abort ();
5113 if (charset == charset_big5)
5115 int c1, c2;
5117 c1 = code >> 8, c2 = code & 0xFF;
5118 EMIT_TWO_BYTES (c1, c2);
5120 else
5121 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5124 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5125 coding->produced_char += produced_chars;
5126 coding->produced = dst - coding->destination;
5127 return 0;
5131 /*** 10. CCL handlers ***/
5133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5134 Return true if a text is encoded in a coding system of which
5135 encoder/decoder are written in CCL program. */
5137 static bool
5138 detect_coding_ccl (struct coding_system *coding,
5139 struct coding_detection_info *detect_info)
5141 const unsigned char *src = coding->source, *src_base;
5142 const unsigned char *src_end = coding->source + coding->src_bytes;
5143 bool multibytep = coding->src_multibyte;
5144 ptrdiff_t consumed_chars = 0;
5145 int found = 0;
5146 unsigned char *valids;
5147 ptrdiff_t head_ascii = coding->head_ascii;
5148 Lisp_Object attrs;
5150 detect_info->checked |= CATEGORY_MASK_CCL;
5152 coding = &coding_categories[coding_category_ccl];
5153 valids = CODING_CCL_VALIDS (coding);
5154 attrs = CODING_ID_ATTRS (coding->id);
5155 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5156 src += head_ascii;
5158 while (1)
5160 int c;
5162 src_base = src;
5163 ONE_MORE_BYTE (c);
5164 if (c < 0 || ! valids[c])
5165 break;
5166 if ((valids[c] > 1))
5167 found = CATEGORY_MASK_CCL;
5169 detect_info->rejected |= CATEGORY_MASK_CCL;
5170 return 0;
5172 no_more_source:
5173 detect_info->found |= found;
5174 return 1;
5177 static void
5178 decode_coding_ccl (struct coding_system *coding)
5180 const unsigned char *src = coding->source + coding->consumed;
5181 const unsigned char *src_end = coding->source + coding->src_bytes;
5182 int *charbuf = coding->charbuf + coding->charbuf_used;
5183 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5184 ptrdiff_t consumed_chars = 0;
5185 bool multibytep = coding->src_multibyte;
5186 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5187 int source_charbuf[1024];
5188 int source_byteidx[1025];
5189 Lisp_Object attrs, charset_list;
5191 CODING_GET_INFO (coding, attrs, charset_list);
5193 while (1)
5195 const unsigned char *p = src;
5196 ptrdiff_t offset;
5197 int i = 0;
5199 if (multibytep)
5201 while (i < 1024 && p < src_end)
5203 source_byteidx[i] = p - src;
5204 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5206 source_byteidx[i] = p - src;
5208 else
5209 while (i < 1024 && p < src_end)
5210 source_charbuf[i++] = *p++;
5212 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5213 ccl->last_block = true;
5214 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5215 charset_map_loaded = 0;
5216 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5217 charset_list);
5218 if (charset_map_loaded
5219 && (offset = coding_change_source (coding)))
5221 p += offset;
5222 src += offset;
5223 src_end += offset;
5225 charbuf += ccl->produced;
5226 if (multibytep)
5227 src += source_byteidx[ccl->consumed];
5228 else
5229 src += ccl->consumed;
5230 consumed_chars += ccl->consumed;
5231 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5232 break;
5235 switch (ccl->status)
5237 case CCL_STAT_SUSPEND_BY_SRC:
5238 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5239 break;
5240 case CCL_STAT_SUSPEND_BY_DST:
5241 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5242 break;
5243 case CCL_STAT_QUIT:
5244 case CCL_STAT_INVALID_CMD:
5245 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5246 break;
5247 default:
5248 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5249 break;
5251 coding->consumed_char += consumed_chars;
5252 coding->consumed = src - coding->source;
5253 coding->charbuf_used = charbuf - coding->charbuf;
5256 static bool
5257 encode_coding_ccl (struct coding_system *coding)
5259 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5260 bool multibytep = coding->dst_multibyte;
5261 int *charbuf = coding->charbuf;
5262 int *charbuf_end = charbuf + coding->charbuf_used;
5263 unsigned char *dst = coding->destination + coding->produced;
5264 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5265 int destination_charbuf[1024];
5266 ptrdiff_t produced_chars = 0;
5267 int i;
5268 Lisp_Object attrs, charset_list;
5270 CODING_GET_INFO (coding, attrs, charset_list);
5271 if (coding->consumed_char == coding->src_chars
5272 && coding->mode & CODING_MODE_LAST_BLOCK)
5273 ccl->last_block = true;
5277 ptrdiff_t offset;
5279 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5280 charset_map_loaded = 0;
5281 ccl_driver (ccl, charbuf, destination_charbuf,
5282 charbuf_end - charbuf, 1024, charset_list);
5283 if (charset_map_loaded
5284 && (offset = coding_change_destination (coding)))
5285 dst += offset;
5286 if (multibytep)
5288 ASSURE_DESTINATION (ccl->produced * 2);
5289 for (i = 0; i < ccl->produced; i++)
5290 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5292 else
5294 ASSURE_DESTINATION (ccl->produced);
5295 for (i = 0; i < ccl->produced; i++)
5296 *dst++ = destination_charbuf[i] & 0xFF;
5297 produced_chars += ccl->produced;
5299 charbuf += ccl->consumed;
5300 if (ccl->status == CCL_STAT_QUIT
5301 || ccl->status == CCL_STAT_INVALID_CMD)
5302 break;
5304 while (charbuf < charbuf_end);
5306 switch (ccl->status)
5308 case CCL_STAT_SUSPEND_BY_SRC:
5309 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5310 break;
5311 case CCL_STAT_SUSPEND_BY_DST:
5312 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5313 break;
5314 case CCL_STAT_QUIT:
5315 case CCL_STAT_INVALID_CMD:
5316 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5317 break;
5318 default:
5319 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5320 break;
5323 coding->produced_char += produced_chars;
5324 coding->produced = dst - coding->destination;
5325 return 0;
5329 /*** 10, 11. no-conversion handlers ***/
5331 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5333 static void
5334 decode_coding_raw_text (struct coding_system *coding)
5336 bool eol_dos
5337 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5339 coding->chars_at_source = 1;
5340 coding->consumed_char = coding->src_chars;
5341 coding->consumed = coding->src_bytes;
5342 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5344 coding->consumed_char--;
5345 coding->consumed--;
5346 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5348 else
5349 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5352 static bool
5353 encode_coding_raw_text (struct coding_system *coding)
5355 bool multibytep = coding->dst_multibyte;
5356 int *charbuf = coding->charbuf;
5357 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5358 unsigned char *dst = coding->destination + coding->produced;
5359 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5360 ptrdiff_t produced_chars = 0;
5361 int c;
5363 if (multibytep)
5365 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5367 if (coding->src_multibyte)
5368 while (charbuf < charbuf_end)
5370 ASSURE_DESTINATION (safe_room);
5371 c = *charbuf++;
5372 if (ASCII_CHAR_P (c))
5373 EMIT_ONE_ASCII_BYTE (c);
5374 else if (CHAR_BYTE8_P (c))
5376 c = CHAR_TO_BYTE8 (c);
5377 EMIT_ONE_BYTE (c);
5379 else
5381 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5383 CHAR_STRING_ADVANCE (c, p1);
5386 EMIT_ONE_BYTE (*p0);
5387 p0++;
5389 while (p0 < p1);
5392 else
5393 while (charbuf < charbuf_end)
5395 ASSURE_DESTINATION (safe_room);
5396 c = *charbuf++;
5397 EMIT_ONE_BYTE (c);
5400 else
5402 if (coding->src_multibyte)
5404 int safe_room = MAX_MULTIBYTE_LENGTH;
5406 while (charbuf < charbuf_end)
5408 ASSURE_DESTINATION (safe_room);
5409 c = *charbuf++;
5410 if (ASCII_CHAR_P (c))
5411 *dst++ = c;
5412 else if (CHAR_BYTE8_P (c))
5413 *dst++ = CHAR_TO_BYTE8 (c);
5414 else
5415 CHAR_STRING_ADVANCE (c, dst);
5418 else
5420 ASSURE_DESTINATION (charbuf_end - charbuf);
5421 while (charbuf < charbuf_end && dst < dst_end)
5422 *dst++ = *charbuf++;
5424 produced_chars = dst - (coding->destination + coding->produced);
5426 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5427 coding->produced_char += produced_chars;
5428 coding->produced = dst - coding->destination;
5429 return 0;
5432 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5433 Return true if a text is encoded in a charset-based coding system. */
5435 static bool
5436 detect_coding_charset (struct coding_system *coding,
5437 struct coding_detection_info *detect_info)
5439 const unsigned char *src = coding->source, *src_base;
5440 const unsigned char *src_end = coding->source + coding->src_bytes;
5441 bool multibytep = coding->src_multibyte;
5442 ptrdiff_t consumed_chars = 0;
5443 Lisp_Object attrs, valids, name;
5444 int found = 0;
5445 ptrdiff_t head_ascii = coding->head_ascii;
5446 bool check_latin_extra = 0;
5448 detect_info->checked |= CATEGORY_MASK_CHARSET;
5450 coding = &coding_categories[coding_category_charset];
5451 attrs = CODING_ID_ATTRS (coding->id);
5452 valids = AREF (attrs, coding_attr_charset_valids);
5453 name = CODING_ID_NAME (coding->id);
5454 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5455 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5456 || strncmp (SSDATA (SYMBOL_NAME (name)),
5457 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5458 check_latin_extra = 1;
5460 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5461 src += head_ascii;
5463 while (1)
5465 int c;
5466 Lisp_Object val;
5467 struct charset *charset;
5468 int dim, idx;
5470 src_base = src;
5471 ONE_MORE_BYTE (c);
5472 if (c < 0)
5473 continue;
5474 val = AREF (valids, c);
5475 if (NILP (val))
5476 break;
5477 if (c >= 0x80)
5479 if (c < 0xA0
5480 && check_latin_extra
5481 && (!VECTORP (Vlatin_extra_code_table)
5482 || NILP (AREF (Vlatin_extra_code_table, c))))
5483 break;
5484 found = CATEGORY_MASK_CHARSET;
5486 if (INTEGERP (val))
5488 charset = CHARSET_FROM_ID (XFASTINT (val));
5489 dim = CHARSET_DIMENSION (charset);
5490 for (idx = 1; idx < dim; idx++)
5492 if (src == src_end)
5493 goto too_short;
5494 ONE_MORE_BYTE (c);
5495 if (c < charset->code_space[(dim - 1 - idx) * 4]
5496 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5497 break;
5499 if (idx < dim)
5500 break;
5502 else
5504 idx = 1;
5505 for (; CONSP (val); val = XCDR (val))
5507 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5508 dim = CHARSET_DIMENSION (charset);
5509 while (idx < dim)
5511 if (src == src_end)
5512 goto too_short;
5513 ONE_MORE_BYTE (c);
5514 if (c < charset->code_space[(dim - 1 - idx) * 4]
5515 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5516 break;
5517 idx++;
5519 if (idx == dim)
5521 val = Qnil;
5522 break;
5525 if (CONSP (val))
5526 break;
5529 too_short:
5530 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5531 return 0;
5533 no_more_source:
5534 detect_info->found |= found;
5535 return 1;
5538 static void
5539 decode_coding_charset (struct coding_system *coding)
5541 const unsigned char *src = coding->source + coding->consumed;
5542 const unsigned char *src_end = coding->source + coding->src_bytes;
5543 const unsigned char *src_base;
5544 int *charbuf = coding->charbuf + coding->charbuf_used;
5545 /* We may produce one charset annotation in one loop and one more at
5546 the end. */
5547 int *charbuf_end
5548 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5549 ptrdiff_t consumed_chars = 0, consumed_chars_base;
5550 bool multibytep = coding->src_multibyte;
5551 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5552 Lisp_Object valids;
5553 ptrdiff_t char_offset = coding->produced_char;
5554 ptrdiff_t last_offset = char_offset;
5555 int last_id = charset_ascii;
5556 bool eol_dos
5557 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5558 int byte_after_cr = -1;
5560 valids = AREF (attrs, coding_attr_charset_valids);
5562 while (1)
5564 int c;
5565 Lisp_Object val;
5566 struct charset *charset;
5567 int dim;
5568 int len = 1;
5569 unsigned code;
5571 src_base = src;
5572 consumed_chars_base = consumed_chars;
5574 if (charbuf >= charbuf_end)
5576 if (byte_after_cr >= 0)
5577 src_base--;
5578 break;
5581 if (byte_after_cr >= 0)
5583 c = byte_after_cr;
5584 byte_after_cr = -1;
5586 else
5588 ONE_MORE_BYTE (c);
5589 if (eol_dos && c == '\r')
5590 ONE_MORE_BYTE (byte_after_cr);
5592 if (c < 0)
5593 goto invalid_code;
5594 code = c;
5596 val = AREF (valids, c);
5597 if (! INTEGERP (val) && ! CONSP (val))
5598 goto invalid_code;
5599 if (INTEGERP (val))
5601 charset = CHARSET_FROM_ID (XFASTINT (val));
5602 dim = CHARSET_DIMENSION (charset);
5603 while (len < dim)
5605 ONE_MORE_BYTE (c);
5606 code = (code << 8) | c;
5607 len++;
5609 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5610 charset, code, c);
5612 else
5614 /* VAL is a list of charset IDs. It is assured that the
5615 list is sorted by charset dimensions (smaller one
5616 comes first). */
5617 while (CONSP (val))
5619 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5620 dim = CHARSET_DIMENSION (charset);
5621 while (len < dim)
5623 ONE_MORE_BYTE (c);
5624 code = (code << 8) | c;
5625 len++;
5627 CODING_DECODE_CHAR (coding, src, src_base,
5628 src_end, charset, code, c);
5629 if (c >= 0)
5630 break;
5631 val = XCDR (val);
5634 if (c < 0)
5635 goto invalid_code;
5636 if (charset->id != charset_ascii
5637 && last_id != charset->id)
5639 if (last_id != charset_ascii)
5640 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5641 last_id = charset->id;
5642 last_offset = char_offset;
5645 *charbuf++ = c;
5646 char_offset++;
5647 continue;
5649 invalid_code:
5650 src = src_base;
5651 consumed_chars = consumed_chars_base;
5652 ONE_MORE_BYTE (c);
5653 *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5654 char_offset++;
5657 no_more_source:
5658 if (last_id != charset_ascii)
5659 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5660 coding->consumed_char += consumed_chars_base;
5661 coding->consumed = src_base - coding->source;
5662 coding->charbuf_used = charbuf - coding->charbuf;
5665 static bool
5666 encode_coding_charset (struct coding_system *coding)
5668 bool multibytep = coding->dst_multibyte;
5669 int *charbuf = coding->charbuf;
5670 int *charbuf_end = charbuf + coding->charbuf_used;
5671 unsigned char *dst = coding->destination + coding->produced;
5672 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5673 int safe_room = MAX_MULTIBYTE_LENGTH;
5674 ptrdiff_t produced_chars = 0;
5675 Lisp_Object attrs, charset_list;
5676 bool ascii_compatible;
5677 int c;
5679 CODING_GET_INFO (coding, attrs, charset_list);
5680 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5682 while (charbuf < charbuf_end)
5684 struct charset *charset;
5685 unsigned code;
5687 ASSURE_DESTINATION (safe_room);
5688 c = *charbuf++;
5689 if (ascii_compatible && ASCII_CHAR_P (c))
5690 EMIT_ONE_ASCII_BYTE (c);
5691 else if (CHAR_BYTE8_P (c))
5693 c = CHAR_TO_BYTE8 (c);
5694 EMIT_ONE_BYTE (c);
5696 else
5698 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5699 &code, charset);
5701 if (charset)
5703 if (CHARSET_DIMENSION (charset) == 1)
5704 EMIT_ONE_BYTE (code);
5705 else if (CHARSET_DIMENSION (charset) == 2)
5706 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5707 else if (CHARSET_DIMENSION (charset) == 3)
5708 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5709 else
5710 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5711 (code >> 8) & 0xFF, code & 0xFF);
5713 else
5715 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5716 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5717 else
5718 c = coding->default_char;
5719 EMIT_ONE_BYTE (c);
5724 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5725 coding->produced_char += produced_chars;
5726 coding->produced = dst - coding->destination;
5727 return 0;
5731 /*** 7. C library functions ***/
5733 /* Setup coding context CODING from information about CODING_SYSTEM.
5734 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5735 CODING_SYSTEM is invalid, signal an error. */
5737 void
5738 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5740 Lisp_Object attrs;
5741 Lisp_Object eol_type;
5742 Lisp_Object coding_type;
5743 Lisp_Object val;
5745 if (NILP (coding_system))
5746 coding_system = Qundecided;
5748 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5750 attrs = CODING_ID_ATTRS (coding->id);
5751 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5753 coding->mode = 0;
5754 if (VECTORP (eol_type))
5755 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5756 | CODING_REQUIRE_DETECTION_MASK);
5757 else if (! EQ (eol_type, Qunix))
5758 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5759 | CODING_REQUIRE_ENCODING_MASK);
5760 else
5761 coding->common_flags = 0;
5762 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5763 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5764 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5765 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5766 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5767 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5769 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5770 coding->max_charset_id = SCHARS (val) - 1;
5771 coding->safe_charsets = SDATA (val);
5772 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5773 coding->carryover_bytes = 0;
5774 coding->raw_destination = 0;
5776 coding_type = CODING_ATTR_TYPE (attrs);
5777 if (EQ (coding_type, Qundecided))
5779 coding->detector = NULL;
5780 coding->decoder = decode_coding_raw_text;
5781 coding->encoder = encode_coding_raw_text;
5782 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5783 coding->spec.undecided.inhibit_nbd
5784 = (encode_inhibit_flag
5785 (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5786 coding->spec.undecided.inhibit_ied
5787 = (encode_inhibit_flag
5788 (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5789 coding->spec.undecided.prefer_utf_8
5790 = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5792 else if (EQ (coding_type, Qiso_2022))
5794 int i;
5795 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5797 /* Invoke graphic register 0 to plane 0. */
5798 CODING_ISO_INVOCATION (coding, 0) = 0;
5799 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5800 CODING_ISO_INVOCATION (coding, 1)
5801 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5802 /* Setup the initial status of designation. */
5803 for (i = 0; i < 4; i++)
5804 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5805 /* Not single shifting initially. */
5806 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5807 /* Beginning of buffer should also be regarded as bol. */
5808 CODING_ISO_BOL (coding) = 1;
5809 coding->detector = detect_coding_iso_2022;
5810 coding->decoder = decode_coding_iso_2022;
5811 coding->encoder = encode_coding_iso_2022;
5812 if (flags & CODING_ISO_FLAG_SAFE)
5813 coding->mode |= CODING_MODE_SAFE_ENCODING;
5814 coding->common_flags
5815 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5816 | CODING_REQUIRE_FLUSHING_MASK);
5817 if (flags & CODING_ISO_FLAG_COMPOSITION)
5818 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5819 if (flags & CODING_ISO_FLAG_DESIGNATION)
5820 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5821 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5823 setup_iso_safe_charsets (attrs);
5824 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5825 coding->max_charset_id = SCHARS (val) - 1;
5826 coding->safe_charsets = SDATA (val);
5828 CODING_ISO_FLAGS (coding) = flags;
5829 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5830 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5831 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5832 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5834 else if (EQ (coding_type, Qcharset))
5836 coding->detector = detect_coding_charset;
5837 coding->decoder = decode_coding_charset;
5838 coding->encoder = encode_coding_charset;
5839 coding->common_flags
5840 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5842 else if (EQ (coding_type, Qutf_8))
5844 val = AREF (attrs, coding_attr_utf_bom);
5845 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5846 : EQ (val, Qt) ? utf_with_bom
5847 : utf_without_bom);
5848 coding->detector = detect_coding_utf_8;
5849 coding->decoder = decode_coding_utf_8;
5850 coding->encoder = encode_coding_utf_8;
5851 coding->common_flags
5852 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5853 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5854 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5856 else if (EQ (coding_type, Qutf_16))
5858 val = AREF (attrs, coding_attr_utf_bom);
5859 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5860 : EQ (val, Qt) ? utf_with_bom
5861 : utf_without_bom);
5862 val = AREF (attrs, coding_attr_utf_16_endian);
5863 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5864 : utf_16_little_endian);
5865 CODING_UTF_16_SURROGATE (coding) = 0;
5866 coding->detector = detect_coding_utf_16;
5867 coding->decoder = decode_coding_utf_16;
5868 coding->encoder = encode_coding_utf_16;
5869 coding->common_flags
5870 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5871 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5872 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5874 else if (EQ (coding_type, Qccl))
5876 coding->detector = detect_coding_ccl;
5877 coding->decoder = decode_coding_ccl;
5878 coding->encoder = encode_coding_ccl;
5879 coding->common_flags
5880 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5881 | CODING_REQUIRE_FLUSHING_MASK);
5883 else if (EQ (coding_type, Qemacs_mule))
5885 coding->detector = detect_coding_emacs_mule;
5886 coding->decoder = decode_coding_emacs_mule;
5887 coding->encoder = encode_coding_emacs_mule;
5888 coding->common_flags
5889 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5890 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5891 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5893 Lisp_Object tail, safe_charsets;
5894 int max_charset_id = 0;
5896 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5897 tail = XCDR (tail))
5898 if (max_charset_id < XFASTINT (XCAR (tail)))
5899 max_charset_id = XFASTINT (XCAR (tail));
5900 safe_charsets = make_uninit_string (max_charset_id + 1);
5901 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5902 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5903 tail = XCDR (tail))
5904 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5905 coding->max_charset_id = max_charset_id;
5906 coding->safe_charsets = SDATA (safe_charsets);
5908 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5909 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5911 else if (EQ (coding_type, Qshift_jis))
5913 coding->detector = detect_coding_sjis;
5914 coding->decoder = decode_coding_sjis;
5915 coding->encoder = encode_coding_sjis;
5916 coding->common_flags
5917 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5919 else if (EQ (coding_type, Qbig5))
5921 coding->detector = detect_coding_big5;
5922 coding->decoder = decode_coding_big5;
5923 coding->encoder = encode_coding_big5;
5924 coding->common_flags
5925 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5927 else /* EQ (coding_type, Qraw_text) */
5929 coding->detector = NULL;
5930 coding->decoder = decode_coding_raw_text;
5931 coding->encoder = encode_coding_raw_text;
5932 if (! EQ (eol_type, Qunix))
5934 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5935 if (! VECTORP (eol_type))
5936 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5941 return;
5944 /* Return a list of charsets supported by CODING. */
5946 Lisp_Object
5947 coding_charset_list (struct coding_system *coding)
5949 Lisp_Object attrs, charset_list;
5951 CODING_GET_INFO (coding, attrs, charset_list);
5952 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5954 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5956 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5957 charset_list = Viso_2022_charset_list;
5959 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5961 charset_list = Vemacs_mule_charset_list;
5963 return charset_list;
5967 /* Return a list of charsets supported by CODING-SYSTEM. */
5969 Lisp_Object
5970 coding_system_charset_list (Lisp_Object coding_system)
5972 ptrdiff_t id;
5973 Lisp_Object attrs, charset_list;
5975 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5976 attrs = CODING_ID_ATTRS (id);
5978 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5980 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5982 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5983 charset_list = Viso_2022_charset_list;
5984 else
5985 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5987 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5989 charset_list = Vemacs_mule_charset_list;
5991 else
5993 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5995 return charset_list;
5999 /* Return raw-text or one of its subsidiaries that has the same
6000 eol_type as CODING-SYSTEM. */
6002 Lisp_Object
6003 raw_text_coding_system (Lisp_Object coding_system)
6005 Lisp_Object spec, attrs;
6006 Lisp_Object eol_type, raw_text_eol_type;
6008 if (NILP (coding_system))
6009 return Qraw_text;
6010 spec = CODING_SYSTEM_SPEC (coding_system);
6011 attrs = AREF (spec, 0);
6013 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6014 return coding_system;
6016 eol_type = AREF (spec, 2);
6017 if (VECTORP (eol_type))
6018 return Qraw_text;
6019 spec = CODING_SYSTEM_SPEC (Qraw_text);
6020 raw_text_eol_type = AREF (spec, 2);
6021 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6022 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6023 : AREF (raw_text_eol_type, 2));
6027 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6028 the subsidiary that has the same eol-spec as PARENT (if it is not
6029 nil and specifies end-of-line format) or the system's setting
6030 (system_eol_type). */
6032 Lisp_Object
6033 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6035 Lisp_Object spec, eol_type;
6037 if (NILP (coding_system))
6038 coding_system = Qraw_text;
6039 spec = CODING_SYSTEM_SPEC (coding_system);
6040 eol_type = AREF (spec, 2);
6041 if (VECTORP (eol_type))
6043 Lisp_Object parent_eol_type;
6045 if (! NILP (parent))
6047 Lisp_Object parent_spec;
6049 parent_spec = CODING_SYSTEM_SPEC (parent);
6050 parent_eol_type = AREF (parent_spec, 2);
6051 if (VECTORP (parent_eol_type))
6052 parent_eol_type = system_eol_type;
6054 else
6055 parent_eol_type = system_eol_type;
6056 if (EQ (parent_eol_type, Qunix))
6057 coding_system = AREF (eol_type, 0);
6058 else if (EQ (parent_eol_type, Qdos))
6059 coding_system = AREF (eol_type, 1);
6060 else if (EQ (parent_eol_type, Qmac))
6061 coding_system = AREF (eol_type, 2);
6063 return coding_system;
6067 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6068 decided for writing to a process. If not, complement them, and
6069 return a new coding system. */
6071 Lisp_Object
6072 complement_process_encoding_system (Lisp_Object coding_system)
6074 Lisp_Object coding_base = Qnil, eol_base = Qnil;
6075 Lisp_Object spec, attrs;
6076 int i;
6078 for (i = 0; i < 3; i++)
6080 if (i == 1)
6081 coding_system = CDR_SAFE (Vdefault_process_coding_system);
6082 else if (i == 2)
6083 coding_system = preferred_coding_system ();
6084 spec = CODING_SYSTEM_SPEC (coding_system);
6085 if (NILP (spec))
6086 continue;
6087 attrs = AREF (spec, 0);
6088 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6089 coding_base = CODING_ATTR_BASE_NAME (attrs);
6090 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6091 eol_base = coding_system;
6092 if (! NILP (coding_base) && ! NILP (eol_base))
6093 break;
6096 if (i > 0)
6097 /* The original CODING_SYSTEM didn't specify text-conversion or
6098 eol-conversion. Be sure that we return a fully complemented
6099 coding system. */
6100 coding_system = coding_inherit_eol_type (coding_base, eol_base);
6101 return coding_system;
6105 /* Emacs has a mechanism to automatically detect a coding system if it
6106 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6107 it's impossible to distinguish some coding systems accurately
6108 because they use the same range of codes. So, at first, coding
6109 systems are categorized into 7, those are:
6111 o coding-category-emacs-mule
6113 The category for a coding system which has the same code range
6114 as Emacs' internal format. Assigned the coding-system (Lisp
6115 symbol) `emacs-mule' by default.
6117 o coding-category-sjis
6119 The category for a coding system which has the same code range
6120 as SJIS. Assigned the coding-system (Lisp
6121 symbol) `japanese-shift-jis' by default.
6123 o coding-category-iso-7
6125 The category for a coding system which has the same code range
6126 as ISO2022 of 7-bit environment. This doesn't use any locking
6127 shift and single shift functions. This can encode/decode all
6128 charsets. Assigned the coding-system (Lisp symbol)
6129 `iso-2022-7bit' by default.
6131 o coding-category-iso-7-tight
6133 Same as coding-category-iso-7 except that this can
6134 encode/decode only the specified charsets.
6136 o coding-category-iso-8-1
6138 The category for a coding system which has the same code range
6139 as ISO2022 of 8-bit environment and graphic plane 1 used only
6140 for DIMENSION1 charset. This doesn't use any locking shift
6141 and single shift functions. Assigned the coding-system (Lisp
6142 symbol) `iso-latin-1' by default.
6144 o coding-category-iso-8-2
6146 The category for a coding system which has the same code range
6147 as ISO2022 of 8-bit environment and graphic plane 1 used only
6148 for DIMENSION2 charset. This doesn't use any locking shift
6149 and single shift functions. Assigned the coding-system (Lisp
6150 symbol) `japanese-iso-8bit' by default.
6152 o coding-category-iso-7-else
6154 The category for a coding system which has the same code range
6155 as ISO2022 of 7-bit environment but uses locking shift or
6156 single shift functions. Assigned the coding-system (Lisp
6157 symbol) `iso-2022-7bit-lock' by default.
6159 o coding-category-iso-8-else
6161 The category for a coding system which has the same code range
6162 as ISO2022 of 8-bit environment but uses locking shift or
6163 single shift functions. Assigned the coding-system (Lisp
6164 symbol) `iso-2022-8bit-ss2' by default.
6166 o coding-category-big5
6168 The category for a coding system which has the same code range
6169 as BIG5. Assigned the coding-system (Lisp symbol)
6170 `cn-big5' by default.
6172 o coding-category-utf-8
6174 The category for a coding system which has the same code range
6175 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6176 symbol) `utf-8' by default.
6178 o coding-category-utf-16-be
6180 The category for a coding system in which a text has an
6181 Unicode signature (cf. Unicode Standard) in the order of BIG
6182 endian at the head. Assigned the coding-system (Lisp symbol)
6183 `utf-16-be' by default.
6185 o coding-category-utf-16-le
6187 The category for a coding system in which a text has an
6188 Unicode signature (cf. Unicode Standard) in the order of
6189 LITTLE endian at the head. Assigned the coding-system (Lisp
6190 symbol) `utf-16-le' by default.
6192 o coding-category-ccl
6194 The category for a coding system of which encoder/decoder is
6195 written in CCL programs. The default value is nil, i.e., no
6196 coding system is assigned.
6198 o coding-category-binary
6200 The category for a coding system not categorized in any of the
6201 above. Assigned the coding-system (Lisp symbol)
6202 `no-conversion' by default.
6204 Each of them is a Lisp symbol and the value is an actual
6205 `coding-system's (this is also a Lisp symbol) assigned by a user.
6206 What Emacs does actually is to detect a category of coding system.
6207 Then, it uses a `coding-system' assigned to it. If Emacs can't
6208 decide only one possible category, it selects a category of the
6209 highest priority. Priorities of categories are also specified by a
6210 user in a Lisp variable `coding-category-list'.
6214 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6215 int eol_seen);
6218 /* Return the number of ASCII characters at the head of the source.
6219 By side effects, set coding->head_ascii and update
6220 coding->eol_seen. The value of coding->eol_seen is "logical or" of
6221 EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6222 reliable only when all the source bytes are ASCII. */
6224 static ptrdiff_t
6225 check_ascii (struct coding_system *coding)
6227 const unsigned char *src, *end;
6228 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6229 int eol_seen = coding->eol_seen;
6231 coding_set_source (coding);
6232 src = coding->source;
6233 end = src + coding->src_bytes;
6235 if (inhibit_eol_conversion
6236 || SYMBOLP (eol_type))
6238 /* We don't have to check EOL format. */
6239 while (src < end && !( *src & 0x80))
6241 if (*src++ == '\n')
6242 eol_seen |= EOL_SEEN_LF;
6245 else
6247 end--; /* We look ahead one byte for "CR LF". */
6248 while (src < end)
6250 int c = *src;
6252 if (c & 0x80)
6253 break;
6254 src++;
6255 if (c == '\r')
6257 if (*src == '\n')
6259 eol_seen |= EOL_SEEN_CRLF;
6260 src++;
6262 else
6263 eol_seen |= EOL_SEEN_CR;
6265 else if (c == '\n')
6266 eol_seen |= EOL_SEEN_LF;
6268 if (src == end)
6270 int c = *src;
6272 /* All bytes but the last one C are ASCII. */
6273 if (! (c & 0x80))
6275 if (c == '\r')
6276 eol_seen |= EOL_SEEN_CR;
6277 else if (c == '\n')
6278 eol_seen |= EOL_SEEN_LF;
6279 src++;
6283 coding->head_ascii = src - coding->source;
6284 coding->eol_seen = eol_seen;
6285 return (coding->head_ascii);
6289 /* Return the number of characters at the source if all the bytes are
6290 valid UTF-8 (of Unicode range). Otherwise, return -1. By side
6291 effects, update coding->eol_seen. The value of coding->eol_seen is
6292 "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6293 the value is reliable only when all the source bytes are valid
6294 UTF-8. */
6296 static ptrdiff_t
6297 check_utf_8 (struct coding_system *coding)
6299 const unsigned char *src, *end;
6300 int eol_seen;
6301 ptrdiff_t nchars = coding->head_ascii;
6303 if (coding->head_ascii < 0)
6304 check_ascii (coding);
6305 else
6306 coding_set_source (coding);
6307 src = coding->source + coding->head_ascii;
6308 /* We look ahead one byte for CR LF. */
6309 end = coding->source + coding->src_bytes - 1;
6310 eol_seen = coding->eol_seen;
6311 while (src < end)
6313 int c = *src;
6315 if (UTF_8_1_OCTET_P (*src))
6317 src++;
6318 if (c < 0x20)
6320 if (c == '\r')
6322 if (*src == '\n')
6324 eol_seen |= EOL_SEEN_CRLF;
6325 src++;
6326 nchars++;
6328 else
6329 eol_seen |= EOL_SEEN_CR;
6331 else if (c == '\n')
6332 eol_seen |= EOL_SEEN_LF;
6335 else if (UTF_8_2_OCTET_LEADING_P (c))
6337 if (c < 0xC2 /* overlong sequence */
6338 || src + 1 >= end
6339 || ! UTF_8_EXTRA_OCTET_P (src[1]))
6340 return -1;
6341 src += 2;
6343 else if (UTF_8_3_OCTET_LEADING_P (c))
6345 if (src + 2 >= end
6346 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6347 && UTF_8_EXTRA_OCTET_P (src[2])))
6348 return -1;
6349 c = (((c & 0xF) << 12)
6350 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6351 if (c < 0x800 /* overlong sequence */
6352 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6353 return -1;
6354 src += 3;
6356 else if (UTF_8_4_OCTET_LEADING_P (c))
6358 if (src + 3 >= end
6359 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6360 && UTF_8_EXTRA_OCTET_P (src[2])
6361 && UTF_8_EXTRA_OCTET_P (src[3])))
6362 return -1;
6363 c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6364 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6365 if (c < 0x10000 /* overlong sequence */
6366 || c >= 0x110000) /* non-Unicode character */
6367 return -1;
6368 src += 4;
6370 else
6371 return -1;
6372 nchars++;
6375 if (src == end)
6377 if (! UTF_8_1_OCTET_P (*src))
6378 return -1;
6379 nchars++;
6380 if (*src == '\r')
6381 eol_seen |= EOL_SEEN_CR;
6382 else if (*src == '\n')
6383 eol_seen |= EOL_SEEN_LF;
6385 coding->eol_seen = eol_seen;
6386 return nchars;
6390 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6391 SOURCE is encoded. If CATEGORY is one of
6392 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6393 two-byte, else they are encoded by one-byte.
6395 Return one of EOL_SEEN_XXX. */
6397 #define MAX_EOL_CHECK_COUNT 3
6399 static int
6400 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6401 enum coding_category category)
6403 const unsigned char *src = source, *src_end = src + src_bytes;
6404 unsigned char c;
6405 int total = 0;
6406 int eol_seen = EOL_SEEN_NONE;
6408 if ((1 << category) & CATEGORY_MASK_UTF_16)
6410 bool msb = category == (coding_category_utf_16_le
6411 | coding_category_utf_16_le_nosig);
6412 bool lsb = !msb;
6414 while (src + 1 < src_end)
6416 c = src[lsb];
6417 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6419 int this_eol;
6421 if (c == '\n')
6422 this_eol = EOL_SEEN_LF;
6423 else if (src + 3 >= src_end
6424 || src[msb + 2] != 0
6425 || src[lsb + 2] != '\n')
6426 this_eol = EOL_SEEN_CR;
6427 else
6429 this_eol = EOL_SEEN_CRLF;
6430 src += 2;
6433 if (eol_seen == EOL_SEEN_NONE)
6434 /* This is the first end-of-line. */
6435 eol_seen = this_eol;
6436 else if (eol_seen != this_eol)
6438 /* The found type is different from what found before.
6439 Allow for stray ^M characters in DOS EOL files. */
6440 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6441 || (eol_seen == EOL_SEEN_CRLF
6442 && this_eol == EOL_SEEN_CR))
6443 eol_seen = EOL_SEEN_CRLF;
6444 else
6446 eol_seen = EOL_SEEN_LF;
6447 break;
6450 if (++total == MAX_EOL_CHECK_COUNT)
6451 break;
6453 src += 2;
6456 else
6457 while (src < src_end)
6459 c = *src++;
6460 if (c == '\n' || c == '\r')
6462 int this_eol;
6464 if (c == '\n')
6465 this_eol = EOL_SEEN_LF;
6466 else if (src >= src_end || *src != '\n')
6467 this_eol = EOL_SEEN_CR;
6468 else
6469 this_eol = EOL_SEEN_CRLF, src++;
6471 if (eol_seen == EOL_SEEN_NONE)
6472 /* This is the first end-of-line. */
6473 eol_seen = this_eol;
6474 else if (eol_seen != this_eol)
6476 /* The found type is different from what found before.
6477 Allow for stray ^M characters in DOS EOL files. */
6478 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6479 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6480 eol_seen = EOL_SEEN_CRLF;
6481 else
6483 eol_seen = EOL_SEEN_LF;
6484 break;
6487 if (++total == MAX_EOL_CHECK_COUNT)
6488 break;
6491 return eol_seen;
6495 static Lisp_Object
6496 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6498 Lisp_Object eol_type;
6500 eol_type = CODING_ID_EOL_TYPE (coding->id);
6501 if (! VECTORP (eol_type))
6502 /* Already adjusted. */
6503 return eol_type;
6504 if (eol_seen & EOL_SEEN_LF)
6506 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6507 eol_type = Qunix;
6509 else if (eol_seen & EOL_SEEN_CRLF)
6511 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6512 eol_type = Qdos;
6514 else if (eol_seen & EOL_SEEN_CR)
6516 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6517 eol_type = Qmac;
6519 return eol_type;
6522 /* Detect how a text specified in CODING is encoded. If a coding
6523 system is detected, update fields of CODING by the detected coding
6524 system. */
6526 static void
6527 detect_coding (struct coding_system *coding)
6529 const unsigned char *src, *src_end;
6530 unsigned int saved_mode = coding->mode;
6531 Lisp_Object found = Qnil;
6532 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6534 coding->consumed = coding->consumed_char = 0;
6535 coding->produced = coding->produced_char = 0;
6536 coding_set_source (coding);
6538 src_end = coding->source + coding->src_bytes;
6540 coding->eol_seen = EOL_SEEN_NONE;
6541 /* If we have not yet decided the text encoding type, detect it
6542 now. */
6543 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6545 int c, i;
6546 struct coding_detection_info detect_info;
6547 bool null_byte_found = 0, eight_bit_found = 0;
6548 bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6549 inhibit_null_byte_detection);
6550 bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6551 inhibit_iso_escape_detection);
6552 bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6554 coding->head_ascii = 0;
6555 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6556 for (src = coding->source; src < src_end; src++)
6558 c = *src;
6559 if (c & 0x80)
6561 eight_bit_found = 1;
6562 if (null_byte_found)
6563 break;
6565 else if (c < 0x20)
6567 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6568 && ! inhibit_ied
6569 && ! detect_info.checked)
6571 if (detect_coding_iso_2022 (coding, &detect_info))
6573 /* We have scanned the whole data. */
6574 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6576 /* We didn't find an 8-bit code. We may
6577 have found a null-byte, but it's very
6578 rare that a binary file conforms to
6579 ISO-2022. */
6580 src = src_end;
6581 coding->head_ascii = src - coding->source;
6583 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6584 break;
6587 else if (! c && !inhibit_nbd)
6589 null_byte_found = 1;
6590 if (eight_bit_found)
6591 break;
6593 else if (! disable_ascii_optimization
6594 && ! inhibit_eol_conversion)
6596 if (c == '\r')
6598 if (src < src_end && src[1] == '\n')
6600 coding->eol_seen |= EOL_SEEN_CRLF;
6601 src++;
6602 if (! eight_bit_found)
6603 coding->head_ascii++;
6605 else
6606 coding->eol_seen |= EOL_SEEN_CR;
6608 else if (c == '\n')
6610 coding->eol_seen |= EOL_SEEN_LF;
6614 if (! eight_bit_found)
6615 coding->head_ascii++;
6617 else if (! eight_bit_found)
6618 coding->head_ascii++;
6621 if (null_byte_found || eight_bit_found
6622 || coding->head_ascii < coding->src_bytes
6623 || detect_info.found)
6625 enum coding_category category;
6626 struct coding_system *this;
6628 if (coding->head_ascii == coding->src_bytes)
6629 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6630 for (i = 0; i < coding_category_raw_text; i++)
6632 category = coding_priorities[i];
6633 this = coding_categories + category;
6634 if (detect_info.found & (1 << category))
6635 break;
6637 else
6639 if (null_byte_found)
6641 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6642 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6644 else if (prefer_utf_8
6645 && detect_coding_utf_8 (coding, &detect_info))
6647 detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6648 detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6650 for (i = 0; i < coding_category_raw_text; i++)
6652 category = coding_priorities[i];
6653 this = coding_categories + category;
6654 /* Some of this->detector (e.g. detect_coding_sjis)
6655 require this information. */
6656 coding->id = this->id;
6657 if (this->id < 0)
6659 /* No coding system of this category is defined. */
6660 detect_info.rejected |= (1 << category);
6662 else if (category >= coding_category_raw_text)
6663 continue;
6664 else if (detect_info.checked & (1 << category))
6666 if (detect_info.found & (1 << category))
6667 break;
6669 else if ((*(this->detector)) (coding, &detect_info)
6670 && detect_info.found & (1 << category))
6671 break;
6675 if (i < coding_category_raw_text)
6677 if (category == coding_category_utf_8_auto)
6679 Lisp_Object coding_systems;
6681 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6682 coding_attr_utf_bom);
6683 if (CONSP (coding_systems))
6685 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6686 found = XCAR (coding_systems);
6687 else
6688 found = XCDR (coding_systems);
6690 else
6691 found = CODING_ID_NAME (this->id);
6693 else if (category == coding_category_utf_16_auto)
6695 Lisp_Object coding_systems;
6697 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6698 coding_attr_utf_bom);
6699 if (CONSP (coding_systems))
6701 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6702 found = XCAR (coding_systems);
6703 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6704 found = XCDR (coding_systems);
6706 else
6707 found = CODING_ID_NAME (this->id);
6709 else
6710 found = CODING_ID_NAME (this->id);
6712 else if (null_byte_found)
6713 found = Qno_conversion;
6714 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6715 == CATEGORY_MASK_ANY)
6716 found = Qraw_text;
6717 else if (detect_info.rejected)
6718 for (i = 0; i < coding_category_raw_text; i++)
6719 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6721 this = coding_categories + coding_priorities[i];
6722 found = CODING_ID_NAME (this->id);
6723 break;
6727 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6728 == coding_category_utf_8_auto)
6730 Lisp_Object coding_systems;
6731 struct coding_detection_info detect_info;
6733 coding_systems
6734 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6735 detect_info.found = detect_info.rejected = 0;
6736 if (check_ascii (coding) == coding->src_bytes)
6738 if (CONSP (coding_systems))
6739 found = XCDR (coding_systems);
6741 else
6743 if (CONSP (coding_systems)
6744 && detect_coding_utf_8 (coding, &detect_info))
6746 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6747 found = XCAR (coding_systems);
6748 else
6749 found = XCDR (coding_systems);
6753 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6754 == coding_category_utf_16_auto)
6756 Lisp_Object coding_systems;
6757 struct coding_detection_info detect_info;
6759 coding_systems
6760 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6761 detect_info.found = detect_info.rejected = 0;
6762 coding->head_ascii = 0;
6763 if (CONSP (coding_systems)
6764 && detect_coding_utf_16 (coding, &detect_info))
6766 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6767 found = XCAR (coding_systems);
6768 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6769 found = XCDR (coding_systems);
6773 if (! NILP (found))
6775 int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6776 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6777 : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6778 : EOL_SEEN_LF);
6780 setup_coding_system (found, coding);
6781 if (specified_eol != EOL_SEEN_NONE)
6782 adjust_coding_eol_type (coding, specified_eol);
6785 coding->mode = saved_mode;
6789 static void
6790 decode_eol (struct coding_system *coding)
6792 Lisp_Object eol_type;
6793 unsigned char *p, *pbeg, *pend;
6795 eol_type = CODING_ID_EOL_TYPE (coding->id);
6796 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6797 return;
6799 if (NILP (coding->dst_object))
6800 pbeg = coding->destination;
6801 else
6802 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6803 pend = pbeg + coding->produced;
6805 if (VECTORP (eol_type))
6807 int eol_seen = EOL_SEEN_NONE;
6809 for (p = pbeg; p < pend; p++)
6811 if (*p == '\n')
6812 eol_seen |= EOL_SEEN_LF;
6813 else if (*p == '\r')
6815 if (p + 1 < pend && *(p + 1) == '\n')
6817 eol_seen |= EOL_SEEN_CRLF;
6818 p++;
6820 else
6821 eol_seen |= EOL_SEEN_CR;
6824 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6825 if ((eol_seen & EOL_SEEN_CRLF) != 0
6826 && (eol_seen & EOL_SEEN_CR) != 0
6827 && (eol_seen & EOL_SEEN_LF) == 0)
6828 eol_seen = EOL_SEEN_CRLF;
6829 else if (eol_seen != EOL_SEEN_NONE
6830 && eol_seen != EOL_SEEN_LF
6831 && eol_seen != EOL_SEEN_CRLF
6832 && eol_seen != EOL_SEEN_CR)
6833 eol_seen = EOL_SEEN_LF;
6834 if (eol_seen != EOL_SEEN_NONE)
6835 eol_type = adjust_coding_eol_type (coding, eol_seen);
6838 if (EQ (eol_type, Qmac))
6840 for (p = pbeg; p < pend; p++)
6841 if (*p == '\r')
6842 *p = '\n';
6844 else if (EQ (eol_type, Qdos))
6846 ptrdiff_t n = 0;
6848 if (NILP (coding->dst_object))
6850 /* Start deleting '\r' from the tail to minimize the memory
6851 movement. */
6852 for (p = pend - 2; p >= pbeg; p--)
6853 if (*p == '\r')
6855 memmove (p, p + 1, pend-- - p - 1);
6856 n++;
6859 else
6861 ptrdiff_t pos_byte = coding->dst_pos_byte;
6862 ptrdiff_t pos = coding->dst_pos;
6863 ptrdiff_t pos_end = pos + coding->produced_char - 1;
6865 while (pos < pos_end)
6867 p = BYTE_POS_ADDR (pos_byte);
6868 if (*p == '\r' && p[1] == '\n')
6870 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6871 n++;
6872 pos_end--;
6874 pos++;
6875 if (coding->dst_multibyte)
6876 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6877 else
6878 pos_byte++;
6881 coding->produced -= n;
6882 coding->produced_char -= n;
6887 /* MAX_LOOKUP's maximum value. MAX_LOOKUP is an int and so cannot
6888 exceed INT_MAX. Also, MAX_LOOKUP is multiplied by sizeof (int) for
6889 alloca, so it cannot exceed MAX_ALLOCA / sizeof (int). */
6890 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6892 /* Return a translation table (or list of them) from coding system
6893 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6894 not ENCODEP). */
6896 static Lisp_Object
6897 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6899 Lisp_Object standard, translation_table;
6900 Lisp_Object val;
6902 if (NILP (Venable_character_translation))
6904 if (max_lookup)
6905 *max_lookup = 0;
6906 return Qnil;
6908 if (encodep)
6909 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6910 standard = Vstandard_translation_table_for_encode;
6911 else
6912 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6913 standard = Vstandard_translation_table_for_decode;
6914 if (NILP (translation_table))
6915 translation_table = standard;
6916 else
6918 if (SYMBOLP (translation_table))
6919 translation_table = Fget (translation_table, Qtranslation_table);
6920 else if (CONSP (translation_table))
6922 translation_table = Fcopy_sequence (translation_table);
6923 for (val = translation_table; CONSP (val); val = XCDR (val))
6924 if (SYMBOLP (XCAR (val)))
6925 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6927 if (CHAR_TABLE_P (standard))
6929 if (CONSP (translation_table))
6930 translation_table = nconc2 (translation_table, list1 (standard));
6931 else
6932 translation_table = list2 (translation_table, standard);
6936 if (max_lookup)
6938 *max_lookup = 1;
6939 if (CHAR_TABLE_P (translation_table)
6940 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6942 val = XCHAR_TABLE (translation_table)->extras[1];
6943 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6944 *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6946 else if (CONSP (translation_table))
6948 Lisp_Object tail;
6950 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6951 if (CHAR_TABLE_P (XCAR (tail))
6952 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6954 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6955 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6956 *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6960 return translation_table;
6963 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6964 do { \
6965 trans = Qnil; \
6966 if (CHAR_TABLE_P (table)) \
6968 trans = CHAR_TABLE_REF (table, c); \
6969 if (CHARACTERP (trans)) \
6970 c = XFASTINT (trans), trans = Qnil; \
6972 else if (CONSP (table)) \
6974 Lisp_Object tail; \
6976 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6977 if (CHAR_TABLE_P (XCAR (tail))) \
6979 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6980 if (CHARACTERP (trans)) \
6981 c = XFASTINT (trans), trans = Qnil; \
6982 else if (! NILP (trans)) \
6983 break; \
6986 } while (0)
6989 /* Return a translation of character(s) at BUF according to TRANS.
6990 TRANS is TO-CHAR or ((FROM . TO) ...) where
6991 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6992 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6993 translation is found, and Qnil if not found..
6994 If BUF is too short to lookup characters in FROM, return Qt. */
6996 static Lisp_Object
6997 get_translation (Lisp_Object trans, int *buf, int *buf_end)
7000 if (INTEGERP (trans))
7001 return trans;
7002 for (; CONSP (trans); trans = XCDR (trans))
7004 Lisp_Object val = XCAR (trans);
7005 Lisp_Object from = XCAR (val);
7006 ptrdiff_t len = ASIZE (from);
7007 ptrdiff_t i;
7009 for (i = 0; i < len; i++)
7011 if (buf + i == buf_end)
7012 return Qt;
7013 if (XINT (AREF (from, i)) != buf[i])
7014 break;
7016 if (i == len)
7017 return val;
7019 return Qnil;
7023 static int
7024 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7025 bool last_block)
7027 unsigned char *dst = coding->destination + coding->produced;
7028 unsigned char *dst_end = coding->destination + coding->dst_bytes;
7029 ptrdiff_t produced;
7030 ptrdiff_t produced_chars = 0;
7031 int carryover = 0;
7033 if (! coding->chars_at_source)
7035 /* Source characters are in coding->charbuf. */
7036 int *buf = coding->charbuf;
7037 int *buf_end = buf + coding->charbuf_used;
7039 if (EQ (coding->src_object, coding->dst_object)
7040 && ! NILP (coding->dst_object))
7042 eassert (growable_destination (coding));
7043 coding_set_source (coding);
7044 dst_end = ((unsigned char *) coding->source) + coding->consumed;
7047 while (buf < buf_end)
7049 int c = *buf;
7050 ptrdiff_t i;
7052 if (c >= 0)
7054 ptrdiff_t from_nchars = 1, to_nchars = 1;
7055 Lisp_Object trans = Qnil;
7057 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7058 if (! NILP (trans))
7060 trans = get_translation (trans, buf, buf_end);
7061 if (INTEGERP (trans))
7062 c = XINT (trans);
7063 else if (CONSP (trans))
7065 from_nchars = ASIZE (XCAR (trans));
7066 trans = XCDR (trans);
7067 if (INTEGERP (trans))
7068 c = XINT (trans);
7069 else
7071 to_nchars = ASIZE (trans);
7072 c = XINT (AREF (trans, 0));
7075 else if (EQ (trans, Qt) && ! last_block)
7076 break;
7079 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7081 eassert (growable_destination (coding));
7082 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7083 / MAX_MULTIBYTE_LENGTH)
7084 < to_nchars)
7085 memory_full (SIZE_MAX);
7086 dst = alloc_destination (coding,
7087 buf_end - buf
7088 + MAX_MULTIBYTE_LENGTH * to_nchars,
7089 dst);
7090 if (EQ (coding->src_object, coding->dst_object))
7092 coding_set_source (coding);
7093 dst_end = (((unsigned char *) coding->source)
7094 + coding->consumed);
7096 else
7097 dst_end = coding->destination + coding->dst_bytes;
7100 for (i = 0; i < to_nchars; i++)
7102 if (i > 0)
7103 c = XINT (AREF (trans, i));
7104 if (coding->dst_multibyte
7105 || ! CHAR_BYTE8_P (c))
7106 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7107 else
7108 *dst++ = CHAR_TO_BYTE8 (c);
7110 produced_chars += to_nchars;
7111 buf += from_nchars;
7113 else
7114 /* This is an annotation datum. (-C) is the length. */
7115 buf += -c;
7117 carryover = buf_end - buf;
7119 else
7121 /* Source characters are at coding->source. */
7122 const unsigned char *src = coding->source;
7123 const unsigned char *src_end = src + coding->consumed;
7125 if (EQ (coding->dst_object, coding->src_object))
7127 eassert (growable_destination (coding));
7128 dst_end = (unsigned char *) src;
7130 if (coding->src_multibyte != coding->dst_multibyte)
7132 if (coding->src_multibyte)
7134 bool multibytep = 1;
7135 ptrdiff_t consumed_chars = 0;
7137 while (1)
7139 const unsigned char *src_base = src;
7140 int c;
7142 ONE_MORE_BYTE (c);
7143 if (dst == dst_end)
7145 eassert (growable_destination (coding));
7146 if (EQ (coding->src_object, coding->dst_object))
7147 dst_end = (unsigned char *) src;
7148 if (dst == dst_end)
7150 ptrdiff_t offset = src - coding->source;
7152 dst = alloc_destination (coding, src_end - src + 1,
7153 dst);
7154 dst_end = coding->destination + coding->dst_bytes;
7155 coding_set_source (coding);
7156 src = coding->source + offset;
7157 src_end = coding->source + coding->consumed;
7158 if (EQ (coding->src_object, coding->dst_object))
7159 dst_end = (unsigned char *) src;
7162 *dst++ = c;
7163 produced_chars++;
7165 no_more_source:
7168 else
7169 while (src < src_end)
7171 bool multibytep = 1;
7172 int c = *src++;
7174 if (dst >= dst_end - 1)
7176 eassert (growable_destination (coding));
7177 if (EQ (coding->src_object, coding->dst_object))
7178 dst_end = (unsigned char *) src;
7179 if (dst >= dst_end - 1)
7181 ptrdiff_t offset = src - coding->source;
7182 ptrdiff_t more_bytes;
7184 if (EQ (coding->src_object, coding->dst_object))
7185 more_bytes = ((src_end - src) / 2) + 2;
7186 else
7187 more_bytes = src_end - src + 2;
7188 dst = alloc_destination (coding, more_bytes, dst);
7189 dst_end = coding->destination + coding->dst_bytes;
7190 coding_set_source (coding);
7191 src = coding->source + offset;
7192 src_end = coding->source + coding->consumed;
7193 if (EQ (coding->src_object, coding->dst_object))
7194 dst_end = (unsigned char *) src;
7197 EMIT_ONE_BYTE (c);
7200 else
7202 if (!EQ (coding->src_object, coding->dst_object))
7204 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7206 if (require > 0)
7208 ptrdiff_t offset = src - coding->source;
7210 dst = alloc_destination (coding, require, dst);
7211 coding_set_source (coding);
7212 src = coding->source + offset;
7213 src_end = coding->source + coding->consumed;
7216 produced_chars = coding->consumed_char;
7217 while (src < src_end)
7218 *dst++ = *src++;
7222 produced = dst - (coding->destination + coding->produced);
7223 if (BUFFERP (coding->dst_object) && produced_chars > 0)
7224 insert_from_gap (produced_chars, produced, 0);
7225 coding->produced += produced;
7226 coding->produced_char += produced_chars;
7227 return carryover;
7230 /* Compose text in CODING->object according to the annotation data at
7231 CHARBUF. CHARBUF is an array:
7232 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7235 static void
7236 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7238 int len;
7239 ptrdiff_t to;
7240 enum composition_method method;
7241 Lisp_Object components;
7243 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7244 to = pos + charbuf[2];
7245 method = (enum composition_method) (charbuf[4]);
7247 if (method == COMPOSITION_RELATIVE)
7248 components = Qnil;
7249 else
7251 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7252 int i, j;
7254 if (method == COMPOSITION_WITH_RULE)
7255 len = charbuf[2] * 3 - 2;
7256 charbuf += MAX_ANNOTATION_LENGTH;
7257 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7258 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7260 if (charbuf[i] >= 0)
7261 args[j] = make_number (charbuf[i]);
7262 else
7264 i++;
7265 args[j] = make_number (charbuf[i] % 0x100);
7268 components = (i == j ? Fstring (j, args) : Fvector (j, args));
7270 compose_text (pos, to, components, Qnil, coding->dst_object);
7274 /* Put `charset' property on text in CODING->object according to
7275 the annotation data at CHARBUF. CHARBUF is an array:
7276 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7279 static void
7280 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7282 ptrdiff_t from = pos - charbuf[2];
7283 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7285 Fput_text_property (make_number (from), make_number (pos),
7286 Qcharset, CHARSET_NAME (charset),
7287 coding->dst_object);
7290 #define MAX_CHARBUF_SIZE 0x4000
7291 /* How many units decoding functions expect in coding->charbuf at
7292 most. Currently, decode_coding_emacs_mule expects the following
7293 size, and that is the largest value. */
7294 #define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)
7296 #define ALLOC_CONVERSION_WORK_AREA(coding, size) \
7297 do { \
7298 ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE, \
7299 MAX_CHARBUF_SIZE); \
7300 coding->charbuf = SAFE_ALLOCA (units * sizeof (int)); \
7301 coding->charbuf_size = units; \
7302 } while (0)
7304 static void
7305 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7307 int *charbuf = coding->charbuf;
7308 int *charbuf_end = charbuf + coding->charbuf_used;
7310 if (NILP (coding->dst_object))
7311 return;
7313 while (charbuf < charbuf_end)
7315 if (*charbuf >= 0)
7316 pos++, charbuf++;
7317 else
7319 int len = -*charbuf;
7321 if (len > 2)
7322 switch (charbuf[1])
7324 case CODING_ANNOTATE_COMPOSITION_MASK:
7325 produce_composition (coding, charbuf, pos);
7326 break;
7327 case CODING_ANNOTATE_CHARSET_MASK:
7328 produce_charset (coding, charbuf, pos);
7329 break;
7331 charbuf += len;
7336 /* Decode the data at CODING->src_object into CODING->dst_object.
7337 CODING->src_object is a buffer, a string, or nil.
7338 CODING->dst_object is a buffer.
7340 If CODING->src_object is a buffer, it must be the current buffer.
7341 In this case, if CODING->src_pos is positive, it is a position of
7342 the source text in the buffer, otherwise, the source text is in the
7343 gap area of the buffer, and CODING->src_pos specifies the offset of
7344 the text from GPT (which must be the same as PT). If this is the
7345 same buffer as CODING->dst_object, CODING->src_pos must be
7346 negative.
7348 If CODING->src_object is a string, CODING->src_pos is an index to
7349 that string.
7351 If CODING->src_object is nil, CODING->source must already point to
7352 the non-relocatable memory area. In this case, CODING->src_pos is
7353 an offset from CODING->source.
7355 The decoded data is inserted at the current point of the buffer
7356 CODING->dst_object.
7359 static void
7360 decode_coding (struct coding_system *coding)
7362 Lisp_Object attrs;
7363 Lisp_Object undo_list;
7364 Lisp_Object translation_table;
7365 struct ccl_spec cclspec;
7366 int carryover;
7367 int i;
7369 USE_SAFE_ALLOCA;
7371 if (BUFFERP (coding->src_object)
7372 && coding->src_pos > 0
7373 && coding->src_pos < GPT
7374 && coding->src_pos + coding->src_chars > GPT)
7375 move_gap_both (coding->src_pos, coding->src_pos_byte);
7377 undo_list = Qt;
7378 if (BUFFERP (coding->dst_object))
7380 set_buffer_internal (XBUFFER (coding->dst_object));
7381 if (GPT != PT)
7382 move_gap_both (PT, PT_BYTE);
7384 /* We must disable undo_list in order to record the whole insert
7385 transaction via record_insert at the end. But doing so also
7386 disables the recording of the first change to the undo_list.
7387 Therefore we check for first change here and record it via
7388 record_first_change if needed. */
7389 if (MODIFF <= SAVE_MODIFF)
7390 record_first_change ();
7392 undo_list = BVAR (current_buffer, undo_list);
7393 bset_undo_list (current_buffer, Qt);
7396 coding->consumed = coding->consumed_char = 0;
7397 coding->produced = coding->produced_char = 0;
7398 coding->chars_at_source = 0;
7399 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7401 ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7403 attrs = CODING_ID_ATTRS (coding->id);
7404 translation_table = get_translation_table (attrs, 0, NULL);
7406 carryover = 0;
7407 if (coding->decoder == decode_coding_ccl)
7409 coding->spec.ccl = &cclspec;
7410 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7414 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7416 coding_set_source (coding);
7417 coding->annotated = 0;
7418 coding->charbuf_used = carryover;
7419 (*(coding->decoder)) (coding);
7420 coding_set_destination (coding);
7421 carryover = produce_chars (coding, translation_table, 0);
7422 if (coding->annotated)
7423 produce_annotation (coding, pos);
7424 for (i = 0; i < carryover; i++)
7425 coding->charbuf[i]
7426 = coding->charbuf[coding->charbuf_used - carryover + i];
7428 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7429 || (coding->consumed < coding->src_bytes
7430 && (coding->result == CODING_RESULT_SUCCESS
7431 || coding->result == CODING_RESULT_INVALID_SRC)));
7433 if (carryover > 0)
7435 coding_set_destination (coding);
7436 coding->charbuf_used = carryover;
7437 produce_chars (coding, translation_table, 1);
7440 coding->carryover_bytes = 0;
7441 if (coding->consumed < coding->src_bytes)
7443 ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7444 const unsigned char *src;
7446 coding_set_source (coding);
7447 coding_set_destination (coding);
7448 src = coding->source + coding->consumed;
7450 if (coding->mode & CODING_MODE_LAST_BLOCK)
7452 /* Flush out unprocessed data as binary chars. We are sure
7453 that the number of data is less than the size of
7454 coding->charbuf. */
7455 coding->charbuf_used = 0;
7456 coding->chars_at_source = 0;
7458 while (nbytes-- > 0)
7460 int c = *src++;
7462 if (c & 0x80)
7463 c = BYTE8_TO_CHAR (c);
7464 coding->charbuf[coding->charbuf_used++] = c;
7466 produce_chars (coding, Qnil, 1);
7468 else
7470 /* Record unprocessed bytes in coding->carryover. We are
7471 sure that the number of data is less than the size of
7472 coding->carryover. */
7473 unsigned char *p = coding->carryover;
7475 if (nbytes > sizeof coding->carryover)
7476 nbytes = sizeof coding->carryover;
7477 coding->carryover_bytes = nbytes;
7478 while (nbytes-- > 0)
7479 *p++ = *src++;
7481 coding->consumed = coding->src_bytes;
7484 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7485 && !inhibit_eol_conversion)
7486 decode_eol (coding);
7487 if (BUFFERP (coding->dst_object))
7489 bset_undo_list (current_buffer, undo_list);
7490 record_insert (coding->dst_pos, coding->produced_char);
7493 SAFE_FREE ();
7497 /* Extract an annotation datum from a composition starting at POS and
7498 ending before LIMIT of CODING->src_object (buffer or string), store
7499 the data in BUF, set *STOP to a starting position of the next
7500 composition (if any) or to LIMIT, and return the address of the
7501 next element of BUF.
7503 If such an annotation is not found, set *STOP to a starting
7504 position of a composition after POS (if any) or to LIMIT, and
7505 return BUF. */
7507 static int *
7508 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7509 struct coding_system *coding, int *buf,
7510 ptrdiff_t *stop)
7512 ptrdiff_t start, end;
7513 Lisp_Object prop;
7515 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7516 || end > limit)
7517 *stop = limit;
7518 else if (start > pos)
7519 *stop = start;
7520 else
7522 if (start == pos)
7524 /* We found a composition. Store the corresponding
7525 annotation data in BUF. */
7526 int *head = buf;
7527 enum composition_method method = composition_method (prop);
7528 int nchars = COMPOSITION_LENGTH (prop);
7530 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7531 if (method != COMPOSITION_RELATIVE)
7533 Lisp_Object components;
7534 ptrdiff_t i, len, i_byte;
7536 components = COMPOSITION_COMPONENTS (prop);
7537 if (VECTORP (components))
7539 len = ASIZE (components);
7540 for (i = 0; i < len; i++)
7541 *buf++ = XINT (AREF (components, i));
7543 else if (STRINGP (components))
7545 len = SCHARS (components);
7546 i = i_byte = 0;
7547 while (i < len)
7549 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7550 buf++;
7553 else if (INTEGERP (components))
7555 len = 1;
7556 *buf++ = XINT (components);
7558 else if (CONSP (components))
7560 for (len = 0; CONSP (components);
7561 len++, components = XCDR (components))
7562 *buf++ = XINT (XCAR (components));
7564 else
7565 emacs_abort ();
7566 *head -= len;
7570 if (find_composition (end, limit, &start, &end, &prop,
7571 coding->src_object)
7572 && end <= limit)
7573 *stop = start;
7574 else
7575 *stop = limit;
7577 return buf;
7581 /* Extract an annotation datum from a text property `charset' at POS of
7582 CODING->src_object (buffer of string), store the data in BUF, set
7583 *STOP to the position where the value of `charset' property changes
7584 (limiting by LIMIT), and return the address of the next element of
7585 BUF.
7587 If the property value is nil, set *STOP to the position where the
7588 property value is non-nil (limiting by LIMIT), and return BUF. */
7590 static int *
7591 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7592 struct coding_system *coding, int *buf,
7593 ptrdiff_t *stop)
7595 Lisp_Object val, next;
7596 int id;
7598 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7599 if (! NILP (val) && CHARSETP (val))
7600 id = XINT (CHARSET_SYMBOL_ID (val));
7601 else
7602 id = -1;
7603 ADD_CHARSET_DATA (buf, 0, id);
7604 next = Fnext_single_property_change (make_number (pos), Qcharset,
7605 coding->src_object,
7606 make_number (limit));
7607 *stop = XINT (next);
7608 return buf;
7612 static void
7613 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7614 int max_lookup)
7616 int *buf = coding->charbuf;
7617 int *buf_end = coding->charbuf + coding->charbuf_size;
7618 const unsigned char *src = coding->source + coding->consumed;
7619 const unsigned char *src_end = coding->source + coding->src_bytes;
7620 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7621 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7622 bool multibytep = coding->src_multibyte;
7623 Lisp_Object eol_type;
7624 int c;
7625 ptrdiff_t stop, stop_composition, stop_charset;
7626 int *lookup_buf = NULL;
7628 if (! NILP (translation_table))
7629 lookup_buf = alloca (sizeof (int) * max_lookup);
7631 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7632 if (VECTORP (eol_type))
7633 eol_type = Qunix;
7635 /* Note: composition handling is not yet implemented. */
7636 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7638 if (NILP (coding->src_object))
7639 stop = stop_composition = stop_charset = end_pos;
7640 else
7642 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7643 stop = stop_composition = pos;
7644 else
7645 stop = stop_composition = end_pos;
7646 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7647 stop = stop_charset = pos;
7648 else
7649 stop_charset = end_pos;
7652 /* Compensate for CRLF and conversion. */
7653 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7654 while (buf < buf_end)
7656 Lisp_Object trans;
7658 if (pos == stop)
7660 if (pos == end_pos)
7661 break;
7662 if (pos == stop_composition)
7663 buf = handle_composition_annotation (pos, end_pos, coding,
7664 buf, &stop_composition);
7665 if (pos == stop_charset)
7666 buf = handle_charset_annotation (pos, end_pos, coding,
7667 buf, &stop_charset);
7668 stop = (stop_composition < stop_charset
7669 ? stop_composition : stop_charset);
7672 if (! multibytep)
7674 int bytes;
7676 if (coding->encoder == encode_coding_raw_text
7677 || coding->encoder == encode_coding_ccl)
7678 c = *src++, pos++;
7679 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7680 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7681 else
7682 c = BYTE8_TO_CHAR (*src), src++, pos++;
7684 else
7685 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7686 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7687 c = '\n';
7688 if (! EQ (eol_type, Qunix))
7690 if (c == '\n')
7692 if (EQ (eol_type, Qdos))
7693 *buf++ = '\r';
7694 else
7695 c = '\r';
7699 trans = Qnil;
7700 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7701 if (NILP (trans))
7702 *buf++ = c;
7703 else
7705 ptrdiff_t from_nchars = 1, to_nchars = 1;
7706 int *lookup_buf_end;
7707 const unsigned char *p = src;
7708 int i;
7710 lookup_buf[0] = c;
7711 for (i = 1; i < max_lookup && p < src_end; i++)
7712 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7713 lookup_buf_end = lookup_buf + i;
7714 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7715 if (INTEGERP (trans))
7716 c = XINT (trans);
7717 else if (CONSP (trans))
7719 from_nchars = ASIZE (XCAR (trans));
7720 trans = XCDR (trans);
7721 if (INTEGERP (trans))
7722 c = XINT (trans);
7723 else
7725 to_nchars = ASIZE (trans);
7726 if (buf_end - buf < to_nchars)
7727 break;
7728 c = XINT (AREF (trans, 0));
7731 else
7732 break;
7733 *buf++ = c;
7734 for (i = 1; i < to_nchars; i++)
7735 *buf++ = XINT (AREF (trans, i));
7736 for (i = 1; i < from_nchars; i++, pos++)
7737 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7741 coding->consumed = src - coding->source;
7742 coding->consumed_char = pos - coding->src_pos;
7743 coding->charbuf_used = buf - coding->charbuf;
7744 coding->chars_at_source = 0;
7748 /* Encode the text at CODING->src_object into CODING->dst_object.
7749 CODING->src_object is a buffer or a string.
7750 CODING->dst_object is a buffer or nil.
7752 If CODING->src_object is a buffer, it must be the current buffer.
7753 In this case, if CODING->src_pos is positive, it is a position of
7754 the source text in the buffer, otherwise. the source text is in the
7755 gap area of the buffer, and coding->src_pos specifies the offset of
7756 the text from GPT (which must be the same as PT). If this is the
7757 same buffer as CODING->dst_object, CODING->src_pos must be
7758 negative and CODING should not have `pre-write-conversion'.
7760 If CODING->src_object is a string, CODING should not have
7761 `pre-write-conversion'.
7763 If CODING->dst_object is a buffer, the encoded data is inserted at
7764 the current point of that buffer.
7766 If CODING->dst_object is nil, the encoded data is placed at the
7767 memory area specified by CODING->destination. */
7769 static void
7770 encode_coding (struct coding_system *coding)
7772 Lisp_Object attrs;
7773 Lisp_Object translation_table;
7774 int max_lookup;
7775 struct ccl_spec cclspec;
7777 USE_SAFE_ALLOCA;
7779 attrs = CODING_ID_ATTRS (coding->id);
7780 if (coding->encoder == encode_coding_raw_text)
7781 translation_table = Qnil, max_lookup = 0;
7782 else
7783 translation_table = get_translation_table (attrs, 1, &max_lookup);
7785 if (BUFFERP (coding->dst_object))
7787 set_buffer_internal (XBUFFER (coding->dst_object));
7788 coding->dst_multibyte
7789 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7792 coding->consumed = coding->consumed_char = 0;
7793 coding->produced = coding->produced_char = 0;
7794 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7796 ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7798 if (coding->encoder == encode_coding_ccl)
7800 coding->spec.ccl = &cclspec;
7801 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7803 do {
7804 coding_set_source (coding);
7805 consume_chars (coding, translation_table, max_lookup);
7806 coding_set_destination (coding);
7807 (*(coding->encoder)) (coding);
7808 } while (coding->consumed_char < coding->src_chars);
7810 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7811 insert_from_gap (coding->produced_char, coding->produced, 0);
7813 SAFE_FREE ();
7817 /* Name (or base name) of work buffer for code conversion. */
7818 static Lisp_Object Vcode_conversion_workbuf_name;
7820 /* A working buffer used by the top level conversion. Once it is
7821 created, it is never destroyed. It has the name
7822 Vcode_conversion_workbuf_name. The other working buffers are
7823 destroyed after the use is finished, and their names are modified
7824 versions of Vcode_conversion_workbuf_name. */
7825 static Lisp_Object Vcode_conversion_reused_workbuf;
7827 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7828 static bool reused_workbuf_in_use;
7831 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7832 multibyteness of returning buffer. */
7834 static Lisp_Object
7835 make_conversion_work_buffer (bool multibyte)
7837 Lisp_Object name, workbuf;
7838 struct buffer *current;
7840 if (reused_workbuf_in_use)
7842 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7843 workbuf = Fget_buffer_create (name);
7845 else
7847 reused_workbuf_in_use = 1;
7848 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7849 Vcode_conversion_reused_workbuf
7850 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7851 workbuf = Vcode_conversion_reused_workbuf;
7853 current = current_buffer;
7854 set_buffer_internal (XBUFFER (workbuf));
7855 /* We can't allow modification hooks to run in the work buffer. For
7856 instance, directory_files_internal assumes that file decoding
7857 doesn't compile new regexps. */
7858 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7859 Ferase_buffer ();
7860 bset_undo_list (current_buffer, Qt);
7861 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7862 set_buffer_internal (current);
7863 return workbuf;
7867 static void
7868 code_conversion_restore (Lisp_Object arg)
7870 Lisp_Object current, workbuf;
7871 struct gcpro gcpro1;
7873 GCPRO1 (arg);
7874 current = XCAR (arg);
7875 workbuf = XCDR (arg);
7876 if (! NILP (workbuf))
7878 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7879 reused_workbuf_in_use = 0;
7880 else
7881 Fkill_buffer (workbuf);
7883 set_buffer_internal (XBUFFER (current));
7884 UNGCPRO;
7887 Lisp_Object
7888 code_conversion_save (bool with_work_buf, bool multibyte)
7890 Lisp_Object workbuf = Qnil;
7892 if (with_work_buf)
7893 workbuf = make_conversion_work_buffer (multibyte);
7894 record_unwind_protect (code_conversion_restore,
7895 Fcons (Fcurrent_buffer (), workbuf));
7896 return workbuf;
7899 void
7900 decode_coding_gap (struct coding_system *coding,
7901 ptrdiff_t chars, ptrdiff_t bytes)
7903 ptrdiff_t count = SPECPDL_INDEX ();
7904 Lisp_Object attrs;
7906 coding->src_object = Fcurrent_buffer ();
7907 coding->src_chars = chars;
7908 coding->src_bytes = bytes;
7909 coding->src_pos = -chars;
7910 coding->src_pos_byte = -bytes;
7911 coding->src_multibyte = chars < bytes;
7912 coding->dst_object = coding->src_object;
7913 coding->dst_pos = PT;
7914 coding->dst_pos_byte = PT_BYTE;
7915 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7917 coding->head_ascii = -1;
7918 coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7919 coding->eol_seen = EOL_SEEN_NONE;
7920 if (CODING_REQUIRE_DETECTION (coding))
7921 detect_coding (coding);
7922 attrs = CODING_ID_ATTRS (coding->id);
7923 if (! disable_ascii_optimization
7924 && ! coding->src_multibyte
7925 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7926 && NILP (CODING_ATTR_POST_READ (attrs))
7927 && NILP (get_translation_table (attrs, 0, NULL)))
7929 chars = coding->head_ascii;
7930 if (chars < 0)
7931 chars = check_ascii (coding);
7932 if (chars != bytes)
7934 /* There exists a non-ASCII byte. */
7935 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7936 && coding->detected_utf8_bytes == coding->src_bytes)
7938 if (coding->detected_utf8_chars >= 0)
7939 chars = coding->detected_utf8_chars;
7940 else
7941 chars = check_utf_8 (coding);
7942 if (CODING_UTF_8_BOM (coding) != utf_without_bom
7943 && coding->head_ascii == 0
7944 && coding->source[0] == UTF_8_BOM_1
7945 && coding->source[1] == UTF_8_BOM_2
7946 && coding->source[2] == UTF_8_BOM_3)
7948 chars--;
7949 bytes -= 3;
7950 coding->src_bytes -= 3;
7953 else
7954 chars = -1;
7956 if (chars >= 0)
7958 Lisp_Object eol_type;
7960 eol_type = CODING_ID_EOL_TYPE (coding->id);
7961 if (VECTORP (eol_type))
7963 if (coding->eol_seen != EOL_SEEN_NONE)
7964 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7966 if (EQ (eol_type, Qmac))
7968 unsigned char *src_end = GAP_END_ADDR;
7969 unsigned char *src = src_end - coding->src_bytes;
7971 while (src < src_end)
7973 if (*src++ == '\r')
7974 src[-1] = '\n';
7977 else if (EQ (eol_type, Qdos))
7979 unsigned char *src = GAP_END_ADDR;
7980 unsigned char *src_beg = src - coding->src_bytes;
7981 unsigned char *dst = src;
7982 ptrdiff_t diff;
7984 while (src_beg < src)
7986 *--dst = *--src;
7987 if (*src == '\n' && src > src_beg && src[-1] == '\r')
7988 src--;
7990 diff = dst - src;
7991 bytes -= diff;
7992 chars -= diff;
7994 coding->produced = bytes;
7995 coding->produced_char = chars;
7996 insert_from_gap (chars, bytes, 1);
7997 return;
8000 code_conversion_save (0, 0);
8002 coding->mode |= CODING_MODE_LAST_BLOCK;
8003 current_buffer->text->inhibit_shrinking = 1;
8004 decode_coding (coding);
8005 current_buffer->text->inhibit_shrinking = 0;
8007 if (! NILP (CODING_ATTR_POST_READ (attrs)))
8009 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8010 Lisp_Object val;
8012 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8013 val = call1 (CODING_ATTR_POST_READ (attrs),
8014 make_number (coding->produced_char));
8015 CHECK_NATNUM (val);
8016 coding->produced_char += Z - prev_Z;
8017 coding->produced += Z_BYTE - prev_Z_BYTE;
8020 unbind_to (count, Qnil);
8024 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8025 SRC_OBJECT into DST_OBJECT by coding context CODING.
8027 SRC_OBJECT is a buffer, a string, or Qnil.
8029 If it is a buffer, the text is at point of the buffer. FROM and TO
8030 are positions in the buffer.
8032 If it is a string, the text is at the beginning of the string.
8033 FROM and TO are indices to the string.
8035 If it is nil, the text is at coding->source. FROM and TO are
8036 indices to coding->source.
8038 DST_OBJECT is a buffer, Qt, or Qnil.
8040 If it is a buffer, the decoded text is inserted at point of the
8041 buffer. If the buffer is the same as SRC_OBJECT, the source text
8042 is deleted.
8044 If it is Qt, a string is made from the decoded text, and
8045 set in CODING->dst_object.
8047 If it is Qnil, the decoded text is stored at CODING->destination.
8048 The caller must allocate CODING->dst_bytes bytes at
8049 CODING->destination by xmalloc. If the decoded text is longer than
8050 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8053 void
8054 decode_coding_object (struct coding_system *coding,
8055 Lisp_Object src_object,
8056 ptrdiff_t from, ptrdiff_t from_byte,
8057 ptrdiff_t to, ptrdiff_t to_byte,
8058 Lisp_Object dst_object)
8060 ptrdiff_t count = SPECPDL_INDEX ();
8061 unsigned char *destination IF_LINT (= NULL);
8062 ptrdiff_t dst_bytes IF_LINT (= 0);
8063 ptrdiff_t chars = to - from;
8064 ptrdiff_t bytes = to_byte - from_byte;
8065 Lisp_Object attrs;
8066 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8067 bool need_marker_adjustment = 0;
8068 Lisp_Object old_deactivate_mark;
8070 old_deactivate_mark = Vdeactivate_mark;
8072 if (NILP (dst_object))
8074 destination = coding->destination;
8075 dst_bytes = coding->dst_bytes;
8078 coding->src_object = src_object;
8079 coding->src_chars = chars;
8080 coding->src_bytes = bytes;
8081 coding->src_multibyte = chars < bytes;
8083 if (STRINGP (src_object))
8085 coding->src_pos = from;
8086 coding->src_pos_byte = from_byte;
8088 else if (BUFFERP (src_object))
8090 set_buffer_internal (XBUFFER (src_object));
8091 if (from != GPT)
8092 move_gap_both (from, from_byte);
8093 if (EQ (src_object, dst_object))
8095 struct Lisp_Marker *tail;
8097 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8099 tail->need_adjustment
8100 = tail->charpos == (tail->insertion_type ? from : to);
8101 need_marker_adjustment |= tail->need_adjustment;
8103 saved_pt = PT, saved_pt_byte = PT_BYTE;
8104 TEMP_SET_PT_BOTH (from, from_byte);
8105 current_buffer->text->inhibit_shrinking = 1;
8106 del_range_both (from, from_byte, to, to_byte, 1);
8107 coding->src_pos = -chars;
8108 coding->src_pos_byte = -bytes;
8110 else
8112 coding->src_pos = from;
8113 coding->src_pos_byte = from_byte;
8117 if (CODING_REQUIRE_DETECTION (coding))
8118 detect_coding (coding);
8119 attrs = CODING_ID_ATTRS (coding->id);
8121 if (EQ (dst_object, Qt)
8122 || (! NILP (CODING_ATTR_POST_READ (attrs))
8123 && NILP (dst_object)))
8125 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8126 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8127 coding->dst_pos = BEG;
8128 coding->dst_pos_byte = BEG_BYTE;
8130 else if (BUFFERP (dst_object))
8132 code_conversion_save (0, 0);
8133 coding->dst_object = dst_object;
8134 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8135 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8136 coding->dst_multibyte
8137 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8139 else
8141 code_conversion_save (0, 0);
8142 coding->dst_object = Qnil;
8143 /* Most callers presume this will return a multibyte result, and they
8144 won't use `binary' or `raw-text' anyway, so let's not worry about
8145 CODING_FOR_UNIBYTE. */
8146 coding->dst_multibyte = 1;
8149 decode_coding (coding);
8151 if (BUFFERP (coding->dst_object))
8152 set_buffer_internal (XBUFFER (coding->dst_object));
8154 if (! NILP (CODING_ATTR_POST_READ (attrs)))
8156 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8157 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8158 Lisp_Object val;
8160 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8161 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8162 old_deactivate_mark);
8163 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8164 make_number (coding->produced_char));
8165 UNGCPRO;
8166 CHECK_NATNUM (val);
8167 coding->produced_char += Z - prev_Z;
8168 coding->produced += Z_BYTE - prev_Z_BYTE;
8171 if (EQ (dst_object, Qt))
8173 coding->dst_object = Fbuffer_string ();
8175 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8177 set_buffer_internal (XBUFFER (coding->dst_object));
8178 if (dst_bytes < coding->produced)
8180 eassert (coding->produced > 0);
8181 destination = xrealloc (destination, coding->produced);
8182 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8183 move_gap_both (BEGV, BEGV_BYTE);
8184 memcpy (destination, BEGV_ADDR, coding->produced);
8185 coding->destination = destination;
8189 if (saved_pt >= 0)
8191 /* This is the case of:
8192 (BUFFERP (src_object) && EQ (src_object, dst_object))
8193 As we have moved PT while replacing the original buffer
8194 contents, we must recover it now. */
8195 set_buffer_internal (XBUFFER (src_object));
8196 current_buffer->text->inhibit_shrinking = 0;
8197 if (saved_pt < from)
8198 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8199 else if (saved_pt < from + chars)
8200 TEMP_SET_PT_BOTH (from, from_byte);
8201 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8202 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8203 saved_pt_byte + (coding->produced - bytes));
8204 else
8205 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8206 saved_pt_byte + (coding->produced - bytes));
8208 if (need_marker_adjustment)
8210 struct Lisp_Marker *tail;
8212 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8213 if (tail->need_adjustment)
8215 tail->need_adjustment = 0;
8216 if (tail->insertion_type)
8218 tail->bytepos = from_byte;
8219 tail->charpos = from;
8221 else
8223 tail->bytepos = from_byte + coding->produced;
8224 tail->charpos
8225 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8226 ? tail->bytepos : from + coding->produced_char);
8232 Vdeactivate_mark = old_deactivate_mark;
8233 unbind_to (count, coding->dst_object);
8237 void
8238 encode_coding_object (struct coding_system *coding,
8239 Lisp_Object src_object,
8240 ptrdiff_t from, ptrdiff_t from_byte,
8241 ptrdiff_t to, ptrdiff_t to_byte,
8242 Lisp_Object dst_object)
8244 ptrdiff_t count = SPECPDL_INDEX ();
8245 ptrdiff_t chars = to - from;
8246 ptrdiff_t bytes = to_byte - from_byte;
8247 Lisp_Object attrs;
8248 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8249 bool need_marker_adjustment = 0;
8250 bool kill_src_buffer = 0;
8251 Lisp_Object old_deactivate_mark;
8253 old_deactivate_mark = Vdeactivate_mark;
8255 coding->src_object = src_object;
8256 coding->src_chars = chars;
8257 coding->src_bytes = bytes;
8258 coding->src_multibyte = chars < bytes;
8260 attrs = CODING_ID_ATTRS (coding->id);
8262 if (EQ (src_object, dst_object))
8264 struct Lisp_Marker *tail;
8266 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8268 tail->need_adjustment
8269 = tail->charpos == (tail->insertion_type ? from : to);
8270 need_marker_adjustment |= tail->need_adjustment;
8274 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8276 coding->src_object = code_conversion_save (1, coding->src_multibyte);
8277 set_buffer_internal (XBUFFER (coding->src_object));
8278 if (STRINGP (src_object))
8279 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8280 else if (BUFFERP (src_object))
8281 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8282 else
8283 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8285 if (EQ (src_object, dst_object))
8287 set_buffer_internal (XBUFFER (src_object));
8288 saved_pt = PT, saved_pt_byte = PT_BYTE;
8289 del_range_both (from, from_byte, to, to_byte, 1);
8290 set_buffer_internal (XBUFFER (coding->src_object));
8294 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8296 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8297 old_deactivate_mark);
8298 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8299 make_number (BEG), make_number (Z));
8300 UNGCPRO;
8302 if (XBUFFER (coding->src_object) != current_buffer)
8303 kill_src_buffer = 1;
8304 coding->src_object = Fcurrent_buffer ();
8305 if (BEG != GPT)
8306 move_gap_both (BEG, BEG_BYTE);
8307 coding->src_chars = Z - BEG;
8308 coding->src_bytes = Z_BYTE - BEG_BYTE;
8309 coding->src_pos = BEG;
8310 coding->src_pos_byte = BEG_BYTE;
8311 coding->src_multibyte = Z < Z_BYTE;
8313 else if (STRINGP (src_object))
8315 code_conversion_save (0, 0);
8316 coding->src_pos = from;
8317 coding->src_pos_byte = from_byte;
8319 else if (BUFFERP (src_object))
8321 code_conversion_save (0, 0);
8322 set_buffer_internal (XBUFFER (src_object));
8323 if (EQ (src_object, dst_object))
8325 saved_pt = PT, saved_pt_byte = PT_BYTE;
8326 coding->src_object = del_range_1 (from, to, 1, 1);
8327 coding->src_pos = 0;
8328 coding->src_pos_byte = 0;
8330 else
8332 if (from < GPT && to >= GPT)
8333 move_gap_both (from, from_byte);
8334 coding->src_pos = from;
8335 coding->src_pos_byte = from_byte;
8338 else
8339 code_conversion_save (0, 0);
8341 if (BUFFERP (dst_object))
8343 coding->dst_object = dst_object;
8344 if (EQ (src_object, dst_object))
8346 coding->dst_pos = from;
8347 coding->dst_pos_byte = from_byte;
8349 else
8351 struct buffer *current = current_buffer;
8353 set_buffer_temp (XBUFFER (dst_object));
8354 coding->dst_pos = PT;
8355 coding->dst_pos_byte = PT_BYTE;
8356 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8357 set_buffer_temp (current);
8359 coding->dst_multibyte
8360 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8362 else if (EQ (dst_object, Qt))
8364 ptrdiff_t dst_bytes = max (1, coding->src_chars);
8365 coding->dst_object = Qnil;
8366 coding->destination = xmalloc (dst_bytes);
8367 coding->dst_bytes = dst_bytes;
8368 coding->dst_multibyte = 0;
8370 else
8372 coding->dst_object = Qnil;
8373 coding->dst_multibyte = 0;
8376 encode_coding (coding);
8378 if (EQ (dst_object, Qt))
8380 if (BUFFERP (coding->dst_object))
8381 coding->dst_object = Fbuffer_string ();
8382 else if (coding->raw_destination)
8383 /* This is used to avoid creating huge Lisp string.
8384 NOTE: caller who sets `raw_destination' is also
8385 responsible for freeing `destination' buffer. */
8386 coding->dst_object = Qnil;
8387 else
8389 coding->dst_object
8390 = make_unibyte_string ((char *) coding->destination,
8391 coding->produced);
8392 xfree (coding->destination);
8396 if (saved_pt >= 0)
8398 /* This is the case of:
8399 (BUFFERP (src_object) && EQ (src_object, dst_object))
8400 As we have moved PT while replacing the original buffer
8401 contents, we must recover it now. */
8402 set_buffer_internal (XBUFFER (src_object));
8403 if (saved_pt < from)
8404 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8405 else if (saved_pt < from + chars)
8406 TEMP_SET_PT_BOTH (from, from_byte);
8407 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8408 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8409 saved_pt_byte + (coding->produced - bytes));
8410 else
8411 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8412 saved_pt_byte + (coding->produced - bytes));
8414 if (need_marker_adjustment)
8416 struct Lisp_Marker *tail;
8418 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8419 if (tail->need_adjustment)
8421 tail->need_adjustment = 0;
8422 if (tail->insertion_type)
8424 tail->bytepos = from_byte;
8425 tail->charpos = from;
8427 else
8429 tail->bytepos = from_byte + coding->produced;
8430 tail->charpos
8431 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8432 ? tail->bytepos : from + coding->produced_char);
8438 if (kill_src_buffer)
8439 Fkill_buffer (coding->src_object);
8441 Vdeactivate_mark = old_deactivate_mark;
8442 unbind_to (count, Qnil);
8446 Lisp_Object
8447 preferred_coding_system (void)
8449 int id = coding_categories[coding_priorities[0]].id;
8451 return CODING_ID_NAME (id);
8454 #if defined (WINDOWSNT) || defined (CYGWIN)
8456 Lisp_Object
8457 from_unicode (Lisp_Object str)
8459 CHECK_STRING (str);
8460 if (!STRING_MULTIBYTE (str) &&
8461 SBYTES (str) & 1)
8463 str = Fsubstring (str, make_number (0), make_number (-1));
8466 return code_convert_string_norecord (str, Qutf_16le, 0);
8469 Lisp_Object
8470 from_unicode_buffer (const wchar_t *wstr)
8472 return from_unicode (
8473 make_unibyte_string (
8474 (char *) wstr,
8475 /* we get one of the two final 0 bytes for free. */
8476 1 + sizeof (wchar_t) * wcslen (wstr)));
8479 wchar_t *
8480 to_unicode (Lisp_Object str, Lisp_Object *buf)
8482 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8483 /* We need to make another copy (in addition to the one made by
8484 code_convert_string_norecord) to ensure that the final string is
8485 _doubly_ zero terminated --- that is, that the string is
8486 terminated by two zero bytes and one utf-16le null character.
8487 Because strings are already terminated with a single zero byte,
8488 we just add one additional zero. */
8489 str = make_uninit_string (SBYTES (*buf) + 1);
8490 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8491 SDATA (str) [SBYTES (*buf)] = '\0';
8492 *buf = str;
8493 return WCSDATA (*buf);
8496 #endif /* WINDOWSNT || CYGWIN */
8499 #ifdef emacs
8500 /*** 8. Emacs Lisp library functions ***/
8502 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8503 doc: /* Return t if OBJECT is nil or a coding-system.
8504 See the documentation of `define-coding-system' for information
8505 about coding-system objects. */)
8506 (Lisp_Object object)
8508 if (NILP (object)
8509 || CODING_SYSTEM_ID (object) >= 0)
8510 return Qt;
8511 if (! SYMBOLP (object)
8512 || NILP (Fget (object, Qcoding_system_define_form)))
8513 return Qnil;
8514 return Qt;
8517 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8518 Sread_non_nil_coding_system, 1, 1, 0,
8519 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8520 (Lisp_Object prompt)
8522 Lisp_Object val;
8525 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8526 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8528 while (SCHARS (val) == 0);
8529 return (Fintern (val, Qnil));
8532 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8533 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8534 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8535 Ignores case when completing coding systems (all Emacs coding systems
8536 are lower-case). */)
8537 (Lisp_Object prompt, Lisp_Object default_coding_system)
8539 Lisp_Object val;
8540 ptrdiff_t count = SPECPDL_INDEX ();
8542 if (SYMBOLP (default_coding_system))
8543 default_coding_system = SYMBOL_NAME (default_coding_system);
8544 specbind (Qcompletion_ignore_case, Qt);
8545 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8546 Qt, Qnil, Qcoding_system_history,
8547 default_coding_system, Qnil);
8548 unbind_to (count, Qnil);
8549 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8552 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8553 1, 1, 0,
8554 doc: /* Check validity of CODING-SYSTEM.
8555 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8556 It is valid if it is nil or a symbol defined as a coding system by the
8557 function `define-coding-system'. */)
8558 (Lisp_Object coding_system)
8560 Lisp_Object define_form;
8562 define_form = Fget (coding_system, Qcoding_system_define_form);
8563 if (! NILP (define_form))
8565 Fput (coding_system, Qcoding_system_define_form, Qnil);
8566 safe_eval (define_form);
8568 if (!NILP (Fcoding_system_p (coding_system)))
8569 return coding_system;
8570 xsignal1 (Qcoding_system_error, coding_system);
8574 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8575 HIGHEST, return the coding system of the highest
8576 priority among the detected coding systems. Otherwise return a
8577 list of detected coding systems sorted by their priorities. If
8578 MULTIBYTEP, it is assumed that the bytes are in correct
8579 multibyte form but contains only ASCII and eight-bit chars.
8580 Otherwise, the bytes are raw bytes.
8582 CODING-SYSTEM controls the detection as below:
8584 If it is nil, detect both text-format and eol-format. If the
8585 text-format part of CODING-SYSTEM is already specified
8586 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8587 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8588 detect only text-format. */
8590 Lisp_Object
8591 detect_coding_system (const unsigned char *src,
8592 ptrdiff_t src_chars, ptrdiff_t src_bytes,
8593 bool highest, bool multibytep,
8594 Lisp_Object coding_system)
8596 const unsigned char *src_end = src + src_bytes;
8597 Lisp_Object attrs, eol_type;
8598 Lisp_Object val = Qnil;
8599 struct coding_system coding;
8600 ptrdiff_t id;
8601 struct coding_detection_info detect_info;
8602 enum coding_category base_category;
8603 bool null_byte_found = 0, eight_bit_found = 0;
8605 if (NILP (coding_system))
8606 coding_system = Qundecided;
8607 setup_coding_system (coding_system, &coding);
8608 attrs = CODING_ID_ATTRS (coding.id);
8609 eol_type = CODING_ID_EOL_TYPE (coding.id);
8610 coding_system = CODING_ATTR_BASE_NAME (attrs);
8612 coding.source = src;
8613 coding.src_chars = src_chars;
8614 coding.src_bytes = src_bytes;
8615 coding.src_multibyte = multibytep;
8616 coding.consumed = 0;
8617 coding.mode |= CODING_MODE_LAST_BLOCK;
8618 coding.head_ascii = 0;
8620 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8622 /* At first, detect text-format if necessary. */
8623 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8624 if (base_category == coding_category_undecided)
8626 enum coding_category category IF_LINT (= 0);
8627 struct coding_system *this IF_LINT (= NULL);
8628 int c, i;
8629 bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8630 inhibit_null_byte_detection);
8631 bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8632 inhibit_iso_escape_detection);
8633 bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8635 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8636 for (; src < src_end; src++)
8638 c = *src;
8639 if (c & 0x80)
8641 eight_bit_found = 1;
8642 if (null_byte_found)
8643 break;
8645 else if (c < 0x20)
8647 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8648 && ! inhibit_ied
8649 && ! detect_info.checked)
8651 if (detect_coding_iso_2022 (&coding, &detect_info))
8653 /* We have scanned the whole data. */
8654 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8656 /* We didn't find an 8-bit code. We may
8657 have found a null-byte, but it's very
8658 rare that a binary file confirm to
8659 ISO-2022. */
8660 src = src_end;
8661 coding.head_ascii = src - coding.source;
8663 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8664 break;
8667 else if (! c && !inhibit_nbd)
8669 null_byte_found = 1;
8670 if (eight_bit_found)
8671 break;
8673 if (! eight_bit_found)
8674 coding.head_ascii++;
8676 else if (! eight_bit_found)
8677 coding.head_ascii++;
8680 if (null_byte_found || eight_bit_found
8681 || coding.head_ascii < coding.src_bytes
8682 || detect_info.found)
8684 if (coding.head_ascii == coding.src_bytes)
8685 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8686 for (i = 0; i < coding_category_raw_text; i++)
8688 category = coding_priorities[i];
8689 this = coding_categories + category;
8690 if (detect_info.found & (1 << category))
8691 break;
8693 else
8695 if (null_byte_found)
8697 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8698 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8700 else if (prefer_utf_8
8701 && detect_coding_utf_8 (&coding, &detect_info))
8703 detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8704 detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8706 for (i = 0; i < coding_category_raw_text; i++)
8708 category = coding_priorities[i];
8709 this = coding_categories + category;
8711 if (this->id < 0)
8713 /* No coding system of this category is defined. */
8714 detect_info.rejected |= (1 << category);
8716 else if (category >= coding_category_raw_text)
8717 continue;
8718 else if (detect_info.checked & (1 << category))
8720 if (highest
8721 && (detect_info.found & (1 << category)))
8722 break;
8724 else if ((*(this->detector)) (&coding, &detect_info)
8725 && highest
8726 && (detect_info.found & (1 << category)))
8728 if (category == coding_category_utf_16_auto)
8730 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8731 category = coding_category_utf_16_le;
8732 else
8733 category = coding_category_utf_16_be;
8735 break;
8741 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8742 || null_byte_found)
8744 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8745 id = CODING_SYSTEM_ID (Qno_conversion);
8746 val = list1 (make_number (id));
8748 else if (! detect_info.rejected && ! detect_info.found)
8750 detect_info.found = CATEGORY_MASK_ANY;
8751 id = coding_categories[coding_category_undecided].id;
8752 val = list1 (make_number (id));
8754 else if (highest)
8756 if (detect_info.found)
8758 detect_info.found = 1 << category;
8759 val = list1 (make_number (this->id));
8761 else
8762 for (i = 0; i < coding_category_raw_text; i++)
8763 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8765 detect_info.found = 1 << coding_priorities[i];
8766 id = coding_categories[coding_priorities[i]].id;
8767 val = list1 (make_number (id));
8768 break;
8771 else
8773 int mask = detect_info.rejected | detect_info.found;
8774 int found = 0;
8776 for (i = coding_category_raw_text - 1; i >= 0; i--)
8778 category = coding_priorities[i];
8779 if (! (mask & (1 << category)))
8781 found |= 1 << category;
8782 id = coding_categories[category].id;
8783 if (id >= 0)
8784 val = list1 (make_number (id));
8787 for (i = coding_category_raw_text - 1; i >= 0; i--)
8789 category = coding_priorities[i];
8790 if (detect_info.found & (1 << category))
8792 id = coding_categories[category].id;
8793 val = Fcons (make_number (id), val);
8796 detect_info.found |= found;
8799 else if (base_category == coding_category_utf_8_auto)
8801 if (detect_coding_utf_8 (&coding, &detect_info))
8803 struct coding_system *this;
8805 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8806 this = coding_categories + coding_category_utf_8_sig;
8807 else
8808 this = coding_categories + coding_category_utf_8_nosig;
8809 val = list1 (make_number (this->id));
8812 else if (base_category == coding_category_utf_16_auto)
8814 if (detect_coding_utf_16 (&coding, &detect_info))
8816 struct coding_system *this;
8818 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8819 this = coding_categories + coding_category_utf_16_le;
8820 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8821 this = coding_categories + coding_category_utf_16_be;
8822 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8823 this = coding_categories + coding_category_utf_16_be_nosig;
8824 else
8825 this = coding_categories + coding_category_utf_16_le_nosig;
8826 val = list1 (make_number (this->id));
8829 else
8831 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8832 val = list1 (make_number (coding.id));
8835 /* Then, detect eol-format if necessary. */
8837 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8838 Lisp_Object tail;
8840 if (VECTORP (eol_type))
8842 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8844 if (null_byte_found)
8845 normal_eol = EOL_SEEN_LF;
8846 else
8847 normal_eol = detect_eol (coding.source, src_bytes,
8848 coding_category_raw_text);
8850 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8851 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8852 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8853 coding_category_utf_16_be);
8854 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8855 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8856 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8857 coding_category_utf_16_le);
8859 else
8861 if (EQ (eol_type, Qunix))
8862 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8863 else if (EQ (eol_type, Qdos))
8864 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8865 else
8866 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8869 for (tail = val; CONSP (tail); tail = XCDR (tail))
8871 enum coding_category category;
8872 int this_eol;
8874 id = XINT (XCAR (tail));
8875 attrs = CODING_ID_ATTRS (id);
8876 category = XINT (CODING_ATTR_CATEGORY (attrs));
8877 eol_type = CODING_ID_EOL_TYPE (id);
8878 if (VECTORP (eol_type))
8880 if (category == coding_category_utf_16_be
8881 || category == coding_category_utf_16_be_nosig)
8882 this_eol = utf_16_be_eol;
8883 else if (category == coding_category_utf_16_le
8884 || category == coding_category_utf_16_le_nosig)
8885 this_eol = utf_16_le_eol;
8886 else
8887 this_eol = normal_eol;
8889 if (this_eol == EOL_SEEN_LF)
8890 XSETCAR (tail, AREF (eol_type, 0));
8891 else if (this_eol == EOL_SEEN_CRLF)
8892 XSETCAR (tail, AREF (eol_type, 1));
8893 else if (this_eol == EOL_SEEN_CR)
8894 XSETCAR (tail, AREF (eol_type, 2));
8895 else
8896 XSETCAR (tail, CODING_ID_NAME (id));
8898 else
8899 XSETCAR (tail, CODING_ID_NAME (id));
8903 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8907 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8908 2, 3, 0,
8909 doc: /* Detect coding system of the text in the region between START and END.
8910 Return a list of possible coding systems ordered by priority.
8911 The coding systems to try and their priorities follows what
8912 the function `coding-system-priority-list' (which see) returns.
8914 If only ASCII characters are found (except for such ISO-2022 control
8915 characters as ESC), it returns a list of single element `undecided'
8916 or its subsidiary coding system according to a detected end-of-line
8917 format.
8919 If optional argument HIGHEST is non-nil, return the coding system of
8920 highest priority. */)
8921 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8923 ptrdiff_t from, to;
8924 ptrdiff_t from_byte, to_byte;
8926 validate_region (&start, &end);
8927 from = XINT (start), to = XINT (end);
8928 from_byte = CHAR_TO_BYTE (from);
8929 to_byte = CHAR_TO_BYTE (to);
8931 if (from < GPT && to >= GPT)
8932 move_gap_both (to, to_byte);
8934 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8935 to - from, to_byte - from_byte,
8936 !NILP (highest),
8937 !NILP (BVAR (current_buffer
8938 , enable_multibyte_characters)),
8939 Qnil);
8942 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8943 1, 2, 0,
8944 doc: /* Detect coding system of the text in STRING.
8945 Return a list of possible coding systems ordered by priority.
8946 The coding systems to try and their priorities follows what
8947 the function `coding-system-priority-list' (which see) returns.
8949 If only ASCII characters are found (except for such ISO-2022 control
8950 characters as ESC), it returns a list of single element `undecided'
8951 or its subsidiary coding system according to a detected end-of-line
8952 format.
8954 If optional argument HIGHEST is non-nil, return the coding system of
8955 highest priority. */)
8956 (Lisp_Object string, Lisp_Object highest)
8958 CHECK_STRING (string);
8960 return detect_coding_system (SDATA (string),
8961 SCHARS (string), SBYTES (string),
8962 !NILP (highest), STRING_MULTIBYTE (string),
8963 Qnil);
8967 static bool
8968 char_encodable_p (int c, Lisp_Object attrs)
8970 Lisp_Object tail;
8971 struct charset *charset;
8972 Lisp_Object translation_table;
8974 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8975 if (! NILP (translation_table))
8976 c = translate_char (translation_table, c);
8977 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8978 CONSP (tail); tail = XCDR (tail))
8980 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8981 if (CHAR_CHARSET_P (c, charset))
8982 break;
8984 return (! NILP (tail));
8988 /* Return a list of coding systems that safely encode the text between
8989 START and END. If EXCLUDE is non-nil, it is a list of coding
8990 systems not to check. The returned list doesn't contain any such
8991 coding systems. In any case, if the text contains only ASCII or is
8992 unibyte, return t. */
8994 DEFUN ("find-coding-systems-region-internal",
8995 Ffind_coding_systems_region_internal,
8996 Sfind_coding_systems_region_internal, 2, 3, 0,
8997 doc: /* Internal use only. */)
8998 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
9000 Lisp_Object coding_attrs_list, safe_codings;
9001 ptrdiff_t start_byte, end_byte;
9002 const unsigned char *p, *pbeg, *pend;
9003 int c;
9004 Lisp_Object tail, elt, work_table;
9006 if (STRINGP (start))
9008 if (!STRING_MULTIBYTE (start)
9009 || SCHARS (start) == SBYTES (start))
9010 return Qt;
9011 start_byte = 0;
9012 end_byte = SBYTES (start);
9014 else
9016 CHECK_NUMBER_COERCE_MARKER (start);
9017 CHECK_NUMBER_COERCE_MARKER (end);
9018 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9019 args_out_of_range (start, end);
9020 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9021 return Qt;
9022 start_byte = CHAR_TO_BYTE (XINT (start));
9023 end_byte = CHAR_TO_BYTE (XINT (end));
9024 if (XINT (end) - XINT (start) == end_byte - start_byte)
9025 return Qt;
9027 if (XINT (start) < GPT && XINT (end) > GPT)
9029 if ((GPT - XINT (start)) < (XINT (end) - GPT))
9030 move_gap_both (XINT (start), start_byte);
9031 else
9032 move_gap_both (XINT (end), end_byte);
9036 coding_attrs_list = Qnil;
9037 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9038 if (NILP (exclude)
9039 || NILP (Fmemq (XCAR (tail), exclude)))
9041 Lisp_Object attrs;
9043 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9044 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9046 ASET (attrs, coding_attr_trans_tbl,
9047 get_translation_table (attrs, 1, NULL));
9048 coding_attrs_list = Fcons (attrs, coding_attrs_list);
9052 if (STRINGP (start))
9053 p = pbeg = SDATA (start);
9054 else
9055 p = pbeg = BYTE_POS_ADDR (start_byte);
9056 pend = p + (end_byte - start_byte);
9058 while (p < pend && ASCII_CHAR_P (*p)) p++;
9059 while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9061 work_table = Fmake_char_table (Qnil, Qnil);
9062 while (p < pend)
9064 if (ASCII_CHAR_P (*p))
9065 p++;
9066 else
9068 c = STRING_CHAR_ADVANCE (p);
9069 if (!NILP (char_table_ref (work_table, c)))
9070 /* This character was already checked. Ignore it. */
9071 continue;
9073 charset_map_loaded = 0;
9074 for (tail = coding_attrs_list; CONSP (tail);)
9076 elt = XCAR (tail);
9077 if (NILP (elt))
9078 tail = XCDR (tail);
9079 else if (char_encodable_p (c, elt))
9080 tail = XCDR (tail);
9081 else if (CONSP (XCDR (tail)))
9083 XSETCAR (tail, XCAR (XCDR (tail)));
9084 XSETCDR (tail, XCDR (XCDR (tail)));
9086 else
9088 XSETCAR (tail, Qnil);
9089 tail = XCDR (tail);
9092 if (charset_map_loaded)
9094 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9096 if (STRINGP (start))
9097 pbeg = SDATA (start);
9098 else
9099 pbeg = BYTE_POS_ADDR (start_byte);
9100 p = pbeg + p_offset;
9101 pend = pbeg + pend_offset;
9103 char_table_set (work_table, c, Qt);
9107 safe_codings = list2 (Qraw_text, Qno_conversion);
9108 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9109 if (! NILP (XCAR (tail)))
9110 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9112 return safe_codings;
9116 DEFUN ("unencodable-char-position", Funencodable_char_position,
9117 Sunencodable_char_position, 3, 5, 0,
9118 doc: /* Return position of first un-encodable character in a region.
9119 START and END specify the region and CODING-SYSTEM specifies the
9120 encoding to check. Return nil if CODING-SYSTEM does encode the region.
9122 If optional 4th argument COUNT is non-nil, it specifies at most how
9123 many un-encodable characters to search. In this case, the value is a
9124 list of positions.
9126 If optional 5th argument STRING is non-nil, it is a string to search
9127 for un-encodable characters. In that case, START and END are indexes
9128 to the string and treated as in `substring'. */)
9129 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9130 Lisp_Object count, Lisp_Object string)
9132 EMACS_INT n;
9133 struct coding_system coding;
9134 Lisp_Object attrs, charset_list, translation_table;
9135 Lisp_Object positions;
9136 ptrdiff_t from, to;
9137 const unsigned char *p, *stop, *pend;
9138 bool ascii_compatible;
9140 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9141 attrs = CODING_ID_ATTRS (coding.id);
9142 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9143 return Qnil;
9144 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9145 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9146 translation_table = get_translation_table (attrs, 1, NULL);
9148 if (NILP (string))
9150 validate_region (&start, &end);
9151 from = XINT (start);
9152 to = XINT (end);
9153 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9154 || (ascii_compatible
9155 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9156 return Qnil;
9157 p = CHAR_POS_ADDR (from);
9158 pend = CHAR_POS_ADDR (to);
9159 if (from < GPT && to >= GPT)
9160 stop = GPT_ADDR;
9161 else
9162 stop = pend;
9164 else
9166 CHECK_STRING (string);
9167 validate_subarray (string, start, end, SCHARS (string), &from, &to);
9168 if (! STRING_MULTIBYTE (string))
9169 return Qnil;
9170 p = SDATA (string) + string_char_to_byte (string, from);
9171 stop = pend = SDATA (string) + string_char_to_byte (string, to);
9172 if (ascii_compatible && (to - from) == (pend - p))
9173 return Qnil;
9176 if (NILP (count))
9177 n = 1;
9178 else
9180 CHECK_NATNUM (count);
9181 n = XINT (count);
9184 positions = Qnil;
9185 charset_map_loaded = 0;
9186 while (1)
9188 int c;
9190 if (ascii_compatible)
9191 while (p < stop && ASCII_CHAR_P (*p))
9192 p++, from++;
9193 if (p >= stop)
9195 if (p >= pend)
9196 break;
9197 stop = pend;
9198 p = GAP_END_ADDR;
9201 c = STRING_CHAR_ADVANCE (p);
9202 if (! (ASCII_CHAR_P (c) && ascii_compatible)
9203 && ! char_charset (translate_char (translation_table, c),
9204 charset_list, NULL))
9206 positions = Fcons (make_number (from), positions);
9207 n--;
9208 if (n == 0)
9209 break;
9212 from++;
9213 if (charset_map_loaded && NILP (string))
9215 p = CHAR_POS_ADDR (from);
9216 pend = CHAR_POS_ADDR (to);
9217 if (from < GPT && to >= GPT)
9218 stop = GPT_ADDR;
9219 else
9220 stop = pend;
9221 charset_map_loaded = 0;
9225 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9229 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9230 Scheck_coding_systems_region, 3, 3, 0,
9231 doc: /* Check if the region is encodable by coding systems.
9233 START and END are buffer positions specifying the region.
9234 CODING-SYSTEM-LIST is a list of coding systems to check.
9236 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9237 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9238 whole region, POS0, POS1, ... are buffer positions where non-encodable
9239 characters are found.
9241 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9242 value is nil.
9244 START may be a string. In that case, check if the string is
9245 encodable, and the value contains indices to the string instead of
9246 buffer positions. END is ignored.
9248 If the current buffer (or START if it is a string) is unibyte, the value
9249 is nil. */)
9250 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9252 Lisp_Object list;
9253 ptrdiff_t start_byte, end_byte;
9254 ptrdiff_t pos;
9255 const unsigned char *p, *pbeg, *pend;
9256 int c;
9257 Lisp_Object tail, elt, attrs;
9259 if (STRINGP (start))
9261 if (!STRING_MULTIBYTE (start)
9262 || SCHARS (start) == SBYTES (start))
9263 return Qnil;
9264 start_byte = 0;
9265 end_byte = SBYTES (start);
9266 pos = 0;
9268 else
9270 CHECK_NUMBER_COERCE_MARKER (start);
9271 CHECK_NUMBER_COERCE_MARKER (end);
9272 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9273 args_out_of_range (start, end);
9274 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9275 return Qnil;
9276 start_byte = CHAR_TO_BYTE (XINT (start));
9277 end_byte = CHAR_TO_BYTE (XINT (end));
9278 if (XINT (end) - XINT (start) == end_byte - start_byte)
9279 return Qnil;
9281 if (XINT (start) < GPT && XINT (end) > GPT)
9283 if ((GPT - XINT (start)) < (XINT (end) - GPT))
9284 move_gap_both (XINT (start), start_byte);
9285 else
9286 move_gap_both (XINT (end), end_byte);
9288 pos = XINT (start);
9291 list = Qnil;
9292 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9294 elt = XCAR (tail);
9295 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9296 ASET (attrs, coding_attr_trans_tbl,
9297 get_translation_table (attrs, 1, NULL));
9298 list = Fcons (list2 (elt, attrs), list);
9301 if (STRINGP (start))
9302 p = pbeg = SDATA (start);
9303 else
9304 p = pbeg = BYTE_POS_ADDR (start_byte);
9305 pend = p + (end_byte - start_byte);
9307 while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9308 while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9310 while (p < pend)
9312 if (ASCII_CHAR_P (*p))
9313 p++;
9314 else
9316 c = STRING_CHAR_ADVANCE (p);
9318 charset_map_loaded = 0;
9319 for (tail = list; CONSP (tail); tail = XCDR (tail))
9321 elt = XCDR (XCAR (tail));
9322 if (! char_encodable_p (c, XCAR (elt)))
9323 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9325 if (charset_map_loaded)
9327 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9329 if (STRINGP (start))
9330 pbeg = SDATA (start);
9331 else
9332 pbeg = BYTE_POS_ADDR (start_byte);
9333 p = pbeg + p_offset;
9334 pend = pbeg + pend_offset;
9337 pos++;
9340 tail = list;
9341 list = Qnil;
9342 for (; CONSP (tail); tail = XCDR (tail))
9344 elt = XCAR (tail);
9345 if (CONSP (XCDR (XCDR (elt))))
9346 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9347 list);
9350 return list;
9354 static Lisp_Object
9355 code_convert_region (Lisp_Object start, Lisp_Object end,
9356 Lisp_Object coding_system, Lisp_Object dst_object,
9357 bool encodep, bool norecord)
9359 struct coding_system coding;
9360 ptrdiff_t from, from_byte, to, to_byte;
9361 Lisp_Object src_object;
9363 if (NILP (coding_system))
9364 coding_system = Qno_conversion;
9365 else
9366 CHECK_CODING_SYSTEM (coding_system);
9367 src_object = Fcurrent_buffer ();
9368 if (NILP (dst_object))
9369 dst_object = src_object;
9370 else if (! EQ (dst_object, Qt))
9371 CHECK_BUFFER (dst_object);
9373 validate_region (&start, &end);
9374 from = XFASTINT (start);
9375 from_byte = CHAR_TO_BYTE (from);
9376 to = XFASTINT (end);
9377 to_byte = CHAR_TO_BYTE (to);
9379 setup_coding_system (coding_system, &coding);
9380 coding.mode |= CODING_MODE_LAST_BLOCK;
9382 if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9384 struct buffer *buf = XBUFFER (dst_object);
9385 ptrdiff_t buf_pt = BUF_PT (buf);
9387 invalidate_buffer_caches (buf, buf_pt, buf_pt);
9390 if (encodep)
9391 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9392 dst_object);
9393 else
9394 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9395 dst_object);
9396 if (! norecord)
9397 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9399 return (BUFFERP (dst_object)
9400 ? make_number (coding.produced_char)
9401 : coding.dst_object);
9405 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9406 3, 4, "r\nzCoding system: ",
9407 doc: /* Decode the current region from the specified coding system.
9408 When called from a program, takes four arguments:
9409 START, END, CODING-SYSTEM, and DESTINATION.
9410 START and END are buffer positions.
9412 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9413 If nil, the region between START and END is replaced by the decoded text.
9414 If buffer, the decoded text is inserted in that buffer after point (point
9415 does not move).
9416 In those cases, the length of the decoded text is returned.
9417 If DESTINATION is t, the decoded text is returned.
9419 This function sets `last-coding-system-used' to the precise coding system
9420 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9421 not fully specified.) */)
9422 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9424 return code_convert_region (start, end, coding_system, destination, 0, 0);
9427 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9428 3, 4, "r\nzCoding system: ",
9429 doc: /* Encode the current region by specified coding system.
9430 When called from a program, takes four arguments:
9431 START, END, CODING-SYSTEM and DESTINATION.
9432 START and END are buffer positions.
9434 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9435 If nil, the region between START and END is replace by the encoded text.
9436 If buffer, the encoded text is inserted in that buffer after point (point
9437 does not move).
9438 In those cases, the length of the encoded text is returned.
9439 If DESTINATION is t, the encoded text is returned.
9441 This function sets `last-coding-system-used' to the precise coding system
9442 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9443 not fully specified.) */)
9444 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9446 return code_convert_region (start, end, coding_system, destination, 1, 0);
9449 Lisp_Object
9450 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9451 Lisp_Object dst_object, bool encodep, bool nocopy,
9452 bool norecord)
9454 struct coding_system coding;
9455 ptrdiff_t chars, bytes;
9457 CHECK_STRING (string);
9458 if (NILP (coding_system))
9460 if (! norecord)
9461 Vlast_coding_system_used = Qno_conversion;
9462 if (NILP (dst_object))
9463 return (nocopy ? Fcopy_sequence (string) : string);
9466 if (NILP (coding_system))
9467 coding_system = Qno_conversion;
9468 else
9469 CHECK_CODING_SYSTEM (coding_system);
9470 if (NILP (dst_object))
9471 dst_object = Qt;
9472 else if (! EQ (dst_object, Qt))
9473 CHECK_BUFFER (dst_object);
9475 setup_coding_system (coding_system, &coding);
9476 coding.mode |= CODING_MODE_LAST_BLOCK;
9477 chars = SCHARS (string);
9478 bytes = SBYTES (string);
9480 if (BUFFERP (dst_object))
9482 struct buffer *buf = XBUFFER (dst_object);
9483 ptrdiff_t buf_pt = BUF_PT (buf);
9485 invalidate_buffer_caches (buf, buf_pt, buf_pt);
9488 if (encodep)
9489 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9490 else
9491 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9492 if (! norecord)
9493 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9495 return (BUFFERP (dst_object)
9496 ? make_number (coding.produced_char)
9497 : coding.dst_object);
9501 /* Encode or decode STRING according to CODING_SYSTEM.
9502 Do not set Vlast_coding_system_used.
9504 This function is called only from macros DECODE_FILE and
9505 ENCODE_FILE, thus we ignore character composition. */
9507 Lisp_Object
9508 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9509 bool encodep)
9511 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9514 /* Encode or decode a file name, to or from a unibyte string suitable
9515 for passing to C library functions. */
9516 Lisp_Object
9517 decode_file_name (Lisp_Object fname)
9519 #ifdef WINDOWSNT
9520 /* The w32 build pretends to use UTF-8 for file-name encoding, and
9521 converts the file names either to UTF-16LE or to the system ANSI
9522 codepage internally, depending on the underlying OS; see w32.c. */
9523 if (! NILP (Fcoding_system_p (Qutf_8)))
9524 return code_convert_string_norecord (fname, Qutf_8, 0);
9525 return fname;
9526 #else /* !WINDOWSNT */
9527 if (! NILP (Vfile_name_coding_system))
9528 return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9529 else if (! NILP (Vdefault_file_name_coding_system))
9530 return code_convert_string_norecord (fname,
9531 Vdefault_file_name_coding_system, 0);
9532 else
9533 return fname;
9534 #endif
9537 Lisp_Object
9538 encode_file_name (Lisp_Object fname)
9540 /* This is especially important during bootstrap and dumping, when
9541 file-name encoding is not yet known, and therefore any non-ASCII
9542 file names are unibyte strings, and could only be thrashed if we
9543 try to encode them. */
9544 if (!STRING_MULTIBYTE (fname))
9545 return fname;
9546 #ifdef WINDOWSNT
9547 /* The w32 build pretends to use UTF-8 for file-name encoding, and
9548 converts the file names either to UTF-16LE or to the system ANSI
9549 codepage internally, depending on the underlying OS; see w32.c. */
9550 if (! NILP (Fcoding_system_p (Qutf_8)))
9551 return code_convert_string_norecord (fname, Qutf_8, 1);
9552 return fname;
9553 #else /* !WINDOWSNT */
9554 if (! NILP (Vfile_name_coding_system))
9555 return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9556 else if (! NILP (Vdefault_file_name_coding_system))
9557 return code_convert_string_norecord (fname,
9558 Vdefault_file_name_coding_system, 1);
9559 else
9560 return fname;
9561 #endif
9564 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9565 2, 4, 0,
9566 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9568 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9569 if the decoding operation is trivial.
9571 Optional fourth arg BUFFER non-nil means that the decoded text is
9572 inserted in that buffer after point (point does not move). In this
9573 case, the return value is the length of the decoded text.
9575 This function sets `last-coding-system-used' to the precise coding system
9576 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9577 not fully specified.) */)
9578 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9580 return code_convert_string (string, coding_system, buffer,
9581 0, ! NILP (nocopy), 0);
9584 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9585 2, 4, 0,
9586 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9588 Optional third arg NOCOPY non-nil means it is OK to return STRING
9589 itself if the encoding operation is trivial.
9591 Optional fourth arg BUFFER non-nil means that the encoded text is
9592 inserted in that buffer after point (point does not move). In this
9593 case, the return value is the length of the encoded text.
9595 This function sets `last-coding-system-used' to the precise coding system
9596 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9597 not fully specified.) */)
9598 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9600 return code_convert_string (string, coding_system, buffer,
9601 1, ! NILP (nocopy), 0);
9605 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9606 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9607 Return the corresponding character. */)
9608 (Lisp_Object code)
9610 Lisp_Object spec, attrs, val;
9611 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9612 EMACS_INT ch;
9613 int c;
9615 CHECK_NATNUM (code);
9616 ch = XFASTINT (code);
9617 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9618 attrs = AREF (spec, 0);
9620 if (ASCII_CHAR_P (ch)
9621 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9622 return code;
9624 val = CODING_ATTR_CHARSET_LIST (attrs);
9625 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9626 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9627 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9629 if (ch <= 0x7F)
9631 c = ch;
9632 charset = charset_roman;
9634 else if (ch >= 0xA0 && ch < 0xDF)
9636 c = ch - 0x80;
9637 charset = charset_kana;
9639 else
9641 EMACS_INT c1 = ch >> 8;
9642 int c2 = ch & 0xFF;
9644 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9645 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9646 error ("Invalid code: %"pI"d", ch);
9647 c = ch;
9648 SJIS_TO_JIS (c);
9649 charset = charset_kanji;
9651 c = DECODE_CHAR (charset, c);
9652 if (c < 0)
9653 error ("Invalid code: %"pI"d", ch);
9654 return make_number (c);
9658 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9659 doc: /* Encode a Japanese character CH to shift_jis encoding.
9660 Return the corresponding code in SJIS. */)
9661 (Lisp_Object ch)
9663 Lisp_Object spec, attrs, charset_list;
9664 int c;
9665 struct charset *charset;
9666 unsigned code;
9668 CHECK_CHARACTER (ch);
9669 c = XFASTINT (ch);
9670 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9671 attrs = AREF (spec, 0);
9673 if (ASCII_CHAR_P (c)
9674 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9675 return ch;
9677 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9678 charset = char_charset (c, charset_list, &code);
9679 if (code == CHARSET_INVALID_CODE (charset))
9680 error ("Can't encode by shift_jis encoding: %c", c);
9681 JIS_TO_SJIS (code);
9683 return make_number (code);
9686 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9687 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9688 Return the corresponding character. */)
9689 (Lisp_Object code)
9691 Lisp_Object spec, attrs, val;
9692 struct charset *charset_roman, *charset_big5, *charset;
9693 EMACS_INT ch;
9694 int c;
9696 CHECK_NATNUM (code);
9697 ch = XFASTINT (code);
9698 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9699 attrs = AREF (spec, 0);
9701 if (ASCII_CHAR_P (ch)
9702 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9703 return code;
9705 val = CODING_ATTR_CHARSET_LIST (attrs);
9706 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9707 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9709 if (ch <= 0x7F)
9711 c = ch;
9712 charset = charset_roman;
9714 else
9716 EMACS_INT b1 = ch >> 8;
9717 int b2 = ch & 0x7F;
9718 if (b1 < 0xA1 || b1 > 0xFE
9719 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9720 error ("Invalid code: %"pI"d", ch);
9721 c = ch;
9722 charset = charset_big5;
9724 c = DECODE_CHAR (charset, c);
9725 if (c < 0)
9726 error ("Invalid code: %"pI"d", ch);
9727 return make_number (c);
9730 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9731 doc: /* Encode the Big5 character CH to BIG5 coding system.
9732 Return the corresponding character code in Big5. */)
9733 (Lisp_Object ch)
9735 Lisp_Object spec, attrs, charset_list;
9736 struct charset *charset;
9737 int c;
9738 unsigned code;
9740 CHECK_CHARACTER (ch);
9741 c = XFASTINT (ch);
9742 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9743 attrs = AREF (spec, 0);
9744 if (ASCII_CHAR_P (c)
9745 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9746 return ch;
9748 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9749 charset = char_charset (c, charset_list, &code);
9750 if (code == CHARSET_INVALID_CODE (charset))
9751 error ("Can't encode by Big5 encoding: %c", c);
9753 return make_number (code);
9757 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9758 Sset_terminal_coding_system_internal, 1, 2, 0,
9759 doc: /* Internal use only. */)
9760 (Lisp_Object coding_system, Lisp_Object terminal)
9762 struct terminal *term = decode_live_terminal (terminal);
9763 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9764 CHECK_SYMBOL (coding_system);
9765 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9766 /* We had better not send unsafe characters to terminal. */
9767 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9768 /* Character composition should be disabled. */
9769 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9770 terminal_coding->src_multibyte = 1;
9771 terminal_coding->dst_multibyte = 0;
9772 tset_charset_list
9773 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9774 ? coding_charset_list (terminal_coding)
9775 : list1 (make_number (charset_ascii))));
9776 return Qnil;
9779 DEFUN ("set-safe-terminal-coding-system-internal",
9780 Fset_safe_terminal_coding_system_internal,
9781 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9782 doc: /* Internal use only. */)
9783 (Lisp_Object coding_system)
9785 CHECK_SYMBOL (coding_system);
9786 setup_coding_system (Fcheck_coding_system (coding_system),
9787 &safe_terminal_coding);
9788 /* Character composition should be disabled. */
9789 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9790 safe_terminal_coding.src_multibyte = 1;
9791 safe_terminal_coding.dst_multibyte = 0;
9792 return Qnil;
9795 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9796 Sterminal_coding_system, 0, 1, 0,
9797 doc: /* Return coding system specified for terminal output on the given terminal.
9798 TERMINAL may be a terminal object, a frame, or nil for the selected
9799 frame's terminal device. */)
9800 (Lisp_Object terminal)
9802 struct coding_system *terminal_coding
9803 = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9804 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9806 /* For backward compatibility, return nil if it is `undecided'. */
9807 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9810 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9811 Sset_keyboard_coding_system_internal, 1, 2, 0,
9812 doc: /* Internal use only. */)
9813 (Lisp_Object coding_system, Lisp_Object terminal)
9815 struct terminal *t = decode_live_terminal (terminal);
9816 CHECK_SYMBOL (coding_system);
9817 if (NILP (coding_system))
9818 coding_system = Qno_conversion;
9819 else
9820 Fcheck_coding_system (coding_system);
9821 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9822 /* Character composition should be disabled. */
9823 TERMINAL_KEYBOARD_CODING (t)->common_flags
9824 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9825 return Qnil;
9828 DEFUN ("keyboard-coding-system",
9829 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9830 doc: /* Return coding system specified for decoding keyboard input. */)
9831 (Lisp_Object terminal)
9833 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9834 (decode_live_terminal (terminal))->id);
9838 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9839 Sfind_operation_coding_system, 1, MANY, 0,
9840 doc: /* Choose a coding system for an operation based on the target name.
9841 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9842 DECODING-SYSTEM is the coding system to use for decoding
9843 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9844 for encoding (in case OPERATION does encoding).
9846 The first argument OPERATION specifies an I/O primitive:
9847 For file I/O, `insert-file-contents' or `write-region'.
9848 For process I/O, `call-process', `call-process-region', or `start-process'.
9849 For network I/O, `open-network-stream'.
9851 The remaining arguments should be the same arguments that were passed
9852 to the primitive. Depending on which primitive, one of those arguments
9853 is selected as the TARGET. For example, if OPERATION does file I/O,
9854 whichever argument specifies the file name is TARGET.
9856 TARGET has a meaning which depends on OPERATION:
9857 For file I/O, TARGET is a file name (except for the special case below).
9858 For process I/O, TARGET is a process name.
9859 For network I/O, TARGET is a service name or a port number.
9861 This function looks up what is specified for TARGET in
9862 `file-coding-system-alist', `process-coding-system-alist',
9863 or `network-coding-system-alist' depending on OPERATION.
9864 They may specify a coding system, a cons of coding systems,
9865 or a function symbol to call.
9866 In the last case, we call the function with one argument,
9867 which is a list of all the arguments given to this function.
9868 If the function can't decide a coding system, it can return
9869 `undecided' so that the normal code-detection is performed.
9871 If OPERATION is `insert-file-contents', the argument corresponding to
9872 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9873 file name to look up, and BUFFER is a buffer that contains the file's
9874 contents (not yet decoded). If `file-coding-system-alist' specifies a
9875 function to call for FILENAME, that function should examine the
9876 contents of BUFFER instead of reading the file.
9878 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9879 (ptrdiff_t nargs, Lisp_Object *args)
9881 Lisp_Object operation, target_idx, target, val;
9882 register Lisp_Object chain;
9884 if (nargs < 2)
9885 error ("Too few arguments");
9886 operation = args[0];
9887 if (!SYMBOLP (operation)
9888 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9889 error ("Invalid first argument");
9890 if (nargs <= 1 + XFASTINT (target_idx))
9891 error ("Too few arguments for operation `%s'",
9892 SDATA (SYMBOL_NAME (operation)));
9893 target = args[XFASTINT (target_idx) + 1];
9894 if (!(STRINGP (target)
9895 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9896 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9897 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9898 error ("Invalid argument %"pI"d of operation `%s'",
9899 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9900 if (CONSP (target))
9901 target = XCAR (target);
9903 chain = ((EQ (operation, Qinsert_file_contents)
9904 || EQ (operation, Qwrite_region))
9905 ? Vfile_coding_system_alist
9906 : (EQ (operation, Qopen_network_stream)
9907 ? Vnetwork_coding_system_alist
9908 : Vprocess_coding_system_alist));
9909 if (NILP (chain))
9910 return Qnil;
9912 for (; CONSP (chain); chain = XCDR (chain))
9914 Lisp_Object elt;
9916 elt = XCAR (chain);
9917 if (CONSP (elt)
9918 && ((STRINGP (target)
9919 && STRINGP (XCAR (elt))
9920 && fast_string_match (XCAR (elt), target) >= 0)
9921 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9923 val = XCDR (elt);
9924 /* Here, if VAL is both a valid coding system and a valid
9925 function symbol, we return VAL as a coding system. */
9926 if (CONSP (val))
9927 return val;
9928 if (! SYMBOLP (val))
9929 return Qnil;
9930 if (! NILP (Fcoding_system_p (val)))
9931 return Fcons (val, val);
9932 if (! NILP (Ffboundp (val)))
9934 /* We use call1 rather than safe_call1
9935 so as to get bug reports about functions called here
9936 which don't handle the current interface. */
9937 val = call1 (val, Flist (nargs, args));
9938 if (CONSP (val))
9939 return val;
9940 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9941 return Fcons (val, val);
9943 return Qnil;
9946 return Qnil;
9949 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9950 Sset_coding_system_priority, 0, MANY, 0,
9951 doc: /* Assign higher priority to the coding systems given as arguments.
9952 If multiple coding systems belong to the same category,
9953 all but the first one are ignored.
9955 usage: (set-coding-system-priority &rest coding-systems) */)
9956 (ptrdiff_t nargs, Lisp_Object *args)
9958 ptrdiff_t i, j;
9959 bool changed[coding_category_max];
9960 enum coding_category priorities[coding_category_max];
9962 memset (changed, 0, sizeof changed);
9964 for (i = j = 0; i < nargs; i++)
9966 enum coding_category category;
9967 Lisp_Object spec, attrs;
9969 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9970 attrs = AREF (spec, 0);
9971 category = XINT (CODING_ATTR_CATEGORY (attrs));
9972 if (changed[category])
9973 /* Ignore this coding system because a coding system of the
9974 same category already had a higher priority. */
9975 continue;
9976 changed[category] = 1;
9977 priorities[j++] = category;
9978 if (coding_categories[category].id >= 0
9979 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9980 setup_coding_system (args[i], &coding_categories[category]);
9981 Fset (AREF (Vcoding_category_table, category), args[i]);
9984 /* Now we have decided top J priorities. Reflect the order of the
9985 original priorities to the remaining priorities. */
9987 for (i = j, j = 0; i < coding_category_max; i++, j++)
9989 while (j < coding_category_max
9990 && changed[coding_priorities[j]])
9991 j++;
9992 if (j == coding_category_max)
9993 emacs_abort ();
9994 priorities[i] = coding_priorities[j];
9997 memcpy (coding_priorities, priorities, sizeof priorities);
9999 /* Update `coding-category-list'. */
10000 Vcoding_category_list = Qnil;
10001 for (i = coding_category_max; i-- > 0; )
10002 Vcoding_category_list
10003 = Fcons (AREF (Vcoding_category_table, priorities[i]),
10004 Vcoding_category_list);
10006 return Qnil;
10009 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
10010 Scoding_system_priority_list, 0, 1, 0,
10011 doc: /* Return a list of coding systems ordered by their priorities.
10012 The list contains a subset of coding systems; i.e. coding systems
10013 assigned to each coding category (see `coding-category-list').
10015 HIGHESTP non-nil means just return the highest priority one. */)
10016 (Lisp_Object highestp)
10018 int i;
10019 Lisp_Object val;
10021 for (i = 0, val = Qnil; i < coding_category_max; i++)
10023 enum coding_category category = coding_priorities[i];
10024 int id = coding_categories[category].id;
10025 Lisp_Object attrs;
10027 if (id < 0)
10028 continue;
10029 attrs = CODING_ID_ATTRS (id);
10030 if (! NILP (highestp))
10031 return CODING_ATTR_BASE_NAME (attrs);
10032 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10034 return Fnreverse (val);
10037 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10039 static Lisp_Object
10040 make_subsidiaries (Lisp_Object base)
10042 Lisp_Object subsidiaries;
10043 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10044 USE_SAFE_ALLOCA;
10045 char *buf = SAFE_ALLOCA (base_name_len + 6);
10046 int i;
10048 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10049 subsidiaries = make_uninit_vector (3);
10050 for (i = 0; i < 3; i++)
10052 strcpy (buf + base_name_len, suffixes[i]);
10053 ASET (subsidiaries, i, intern (buf));
10055 SAFE_FREE ();
10056 return subsidiaries;
10060 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10061 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10062 doc: /* For internal use only.
10063 usage: (define-coding-system-internal ...) */)
10064 (ptrdiff_t nargs, Lisp_Object *args)
10066 Lisp_Object name;
10067 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
10068 Lisp_Object attrs; /* Vector of attributes. */
10069 Lisp_Object eol_type;
10070 Lisp_Object aliases;
10071 Lisp_Object coding_type, charset_list, safe_charsets;
10072 enum coding_category category;
10073 Lisp_Object tail, val;
10074 int max_charset_id = 0;
10075 int i;
10077 if (nargs < coding_arg_max)
10078 goto short_args;
10080 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10082 name = args[coding_arg_name];
10083 CHECK_SYMBOL (name);
10084 ASET (attrs, coding_attr_base_name, name);
10086 val = args[coding_arg_mnemonic];
10087 if (! STRINGP (val))
10088 CHECK_CHARACTER (val);
10089 ASET (attrs, coding_attr_mnemonic, val);
10091 coding_type = args[coding_arg_coding_type];
10092 CHECK_SYMBOL (coding_type);
10093 ASET (attrs, coding_attr_type, coding_type);
10095 charset_list = args[coding_arg_charset_list];
10096 if (SYMBOLP (charset_list))
10098 if (EQ (charset_list, Qiso_2022))
10100 if (! EQ (coding_type, Qiso_2022))
10101 error ("Invalid charset-list");
10102 charset_list = Viso_2022_charset_list;
10104 else if (EQ (charset_list, Qemacs_mule))
10106 if (! EQ (coding_type, Qemacs_mule))
10107 error ("Invalid charset-list");
10108 charset_list = Vemacs_mule_charset_list;
10110 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10112 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10113 error ("Invalid charset-list");
10114 if (max_charset_id < XFASTINT (XCAR (tail)))
10115 max_charset_id = XFASTINT (XCAR (tail));
10118 else
10120 charset_list = Fcopy_sequence (charset_list);
10121 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10123 struct charset *charset;
10125 val = XCAR (tail);
10126 CHECK_CHARSET_GET_CHARSET (val, charset);
10127 if (EQ (coding_type, Qiso_2022)
10128 ? CHARSET_ISO_FINAL (charset) < 0
10129 : EQ (coding_type, Qemacs_mule)
10130 ? CHARSET_EMACS_MULE_ID (charset) < 0
10131 : 0)
10132 error ("Can't handle charset `%s'",
10133 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10135 XSETCAR (tail, make_number (charset->id));
10136 if (max_charset_id < charset->id)
10137 max_charset_id = charset->id;
10140 ASET (attrs, coding_attr_charset_list, charset_list);
10142 safe_charsets = make_uninit_string (max_charset_id + 1);
10143 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10144 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10145 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10146 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10148 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10150 val = args[coding_arg_decode_translation_table];
10151 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10152 CHECK_SYMBOL (val);
10153 ASET (attrs, coding_attr_decode_tbl, val);
10155 val = args[coding_arg_encode_translation_table];
10156 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10157 CHECK_SYMBOL (val);
10158 ASET (attrs, coding_attr_encode_tbl, val);
10160 val = args[coding_arg_post_read_conversion];
10161 CHECK_SYMBOL (val);
10162 ASET (attrs, coding_attr_post_read, val);
10164 val = args[coding_arg_pre_write_conversion];
10165 CHECK_SYMBOL (val);
10166 ASET (attrs, coding_attr_pre_write, val);
10168 val = args[coding_arg_default_char];
10169 if (NILP (val))
10170 ASET (attrs, coding_attr_default_char, make_number (' '));
10171 else
10173 CHECK_CHARACTER (val);
10174 ASET (attrs, coding_attr_default_char, val);
10177 val = args[coding_arg_for_unibyte];
10178 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10180 val = args[coding_arg_plist];
10181 CHECK_LIST (val);
10182 ASET (attrs, coding_attr_plist, val);
10184 if (EQ (coding_type, Qcharset))
10186 /* Generate a lisp vector of 256 elements. Each element is nil,
10187 integer, or a list of charset IDs.
10189 If Nth element is nil, the byte code N is invalid in this
10190 coding system.
10192 If Nth element is a number NUM, N is the first byte of a
10193 charset whose ID is NUM.
10195 If Nth element is a list of charset IDs, N is the first byte
10196 of one of them. The list is sorted by dimensions of the
10197 charsets. A charset of smaller dimension comes first. */
10198 val = Fmake_vector (make_number (256), Qnil);
10200 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10202 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10203 int dim = CHARSET_DIMENSION (charset);
10204 int idx = (dim - 1) * 4;
10206 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10207 ASET (attrs, coding_attr_ascii_compat, Qt);
10209 for (i = charset->code_space[idx];
10210 i <= charset->code_space[idx + 1]; i++)
10212 Lisp_Object tmp, tmp2;
10213 int dim2;
10215 tmp = AREF (val, i);
10216 if (NILP (tmp))
10217 tmp = XCAR (tail);
10218 else if (NUMBERP (tmp))
10220 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10221 if (dim < dim2)
10222 tmp = list2 (XCAR (tail), tmp);
10223 else
10224 tmp = list2 (tmp, XCAR (tail));
10226 else
10228 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10230 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10231 if (dim < dim2)
10232 break;
10234 if (NILP (tmp2))
10235 tmp = nconc2 (tmp, list1 (XCAR (tail)));
10236 else
10238 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10239 XSETCAR (tmp2, XCAR (tail));
10242 ASET (val, i, tmp);
10245 ASET (attrs, coding_attr_charset_valids, val);
10246 category = coding_category_charset;
10248 else if (EQ (coding_type, Qccl))
10250 Lisp_Object valids;
10252 if (nargs < coding_arg_ccl_max)
10253 goto short_args;
10255 val = args[coding_arg_ccl_decoder];
10256 CHECK_CCL_PROGRAM (val);
10257 if (VECTORP (val))
10258 val = Fcopy_sequence (val);
10259 ASET (attrs, coding_attr_ccl_decoder, val);
10261 val = args[coding_arg_ccl_encoder];
10262 CHECK_CCL_PROGRAM (val);
10263 if (VECTORP (val))
10264 val = Fcopy_sequence (val);
10265 ASET (attrs, coding_attr_ccl_encoder, val);
10267 val = args[coding_arg_ccl_valids];
10268 valids = Fmake_string (make_number (256), make_number (0));
10269 for (tail = val; CONSP (tail); tail = XCDR (tail))
10271 int from, to;
10273 val = XCAR (tail);
10274 if (INTEGERP (val))
10276 if (! (0 <= XINT (val) && XINT (val) <= 255))
10277 args_out_of_range_3 (val, make_number (0), make_number (255));
10278 from = to = XINT (val);
10280 else
10282 CHECK_CONS (val);
10283 CHECK_NATNUM_CAR (val);
10284 CHECK_NUMBER_CDR (val);
10285 if (XINT (XCAR (val)) > 255)
10286 args_out_of_range_3 (XCAR (val),
10287 make_number (0), make_number (255));
10288 from = XINT (XCAR (val));
10289 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10290 args_out_of_range_3 (XCDR (val),
10291 XCAR (val), make_number (255));
10292 to = XINT (XCDR (val));
10294 for (i = from; i <= to; i++)
10295 SSET (valids, i, 1);
10297 ASET (attrs, coding_attr_ccl_valids, valids);
10299 category = coding_category_ccl;
10301 else if (EQ (coding_type, Qutf_16))
10303 Lisp_Object bom, endian;
10305 ASET (attrs, coding_attr_ascii_compat, Qnil);
10307 if (nargs < coding_arg_utf16_max)
10308 goto short_args;
10310 bom = args[coding_arg_utf16_bom];
10311 if (! NILP (bom) && ! EQ (bom, Qt))
10313 CHECK_CONS (bom);
10314 val = XCAR (bom);
10315 CHECK_CODING_SYSTEM (val);
10316 val = XCDR (bom);
10317 CHECK_CODING_SYSTEM (val);
10319 ASET (attrs, coding_attr_utf_bom, bom);
10321 endian = args[coding_arg_utf16_endian];
10322 CHECK_SYMBOL (endian);
10323 if (NILP (endian))
10324 endian = Qbig;
10325 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10326 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10327 ASET (attrs, coding_attr_utf_16_endian, endian);
10329 category = (CONSP (bom)
10330 ? coding_category_utf_16_auto
10331 : NILP (bom)
10332 ? (EQ (endian, Qbig)
10333 ? coding_category_utf_16_be_nosig
10334 : coding_category_utf_16_le_nosig)
10335 : (EQ (endian, Qbig)
10336 ? coding_category_utf_16_be
10337 : coding_category_utf_16_le));
10339 else if (EQ (coding_type, Qiso_2022))
10341 Lisp_Object initial, reg_usage, request, flags;
10343 if (nargs < coding_arg_iso2022_max)
10344 goto short_args;
10346 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10347 CHECK_VECTOR (initial);
10348 for (i = 0; i < 4; i++)
10350 val = AREF (initial, i);
10351 if (! NILP (val))
10353 struct charset *charset;
10355 CHECK_CHARSET_GET_CHARSET (val, charset);
10356 ASET (initial, i, make_number (CHARSET_ID (charset)));
10357 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10358 ASET (attrs, coding_attr_ascii_compat, Qt);
10360 else
10361 ASET (initial, i, make_number (-1));
10364 reg_usage = args[coding_arg_iso2022_reg_usage];
10365 CHECK_CONS (reg_usage);
10366 CHECK_NUMBER_CAR (reg_usage);
10367 CHECK_NUMBER_CDR (reg_usage);
10369 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10370 for (tail = request; CONSP (tail); tail = XCDR (tail))
10372 int id;
10373 Lisp_Object tmp1;
10375 val = XCAR (tail);
10376 CHECK_CONS (val);
10377 tmp1 = XCAR (val);
10378 CHECK_CHARSET_GET_ID (tmp1, id);
10379 CHECK_NATNUM_CDR (val);
10380 if (XINT (XCDR (val)) >= 4)
10381 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10382 XSETCAR (val, make_number (id));
10385 flags = args[coding_arg_iso2022_flags];
10386 CHECK_NATNUM (flags);
10387 i = XINT (flags) & INT_MAX;
10388 if (EQ (args[coding_arg_charset_list], Qiso_2022))
10389 i |= CODING_ISO_FLAG_FULL_SUPPORT;
10390 flags = make_number (i);
10392 ASET (attrs, coding_attr_iso_initial, initial);
10393 ASET (attrs, coding_attr_iso_usage, reg_usage);
10394 ASET (attrs, coding_attr_iso_request, request);
10395 ASET (attrs, coding_attr_iso_flags, flags);
10396 setup_iso_safe_charsets (attrs);
10398 if (i & CODING_ISO_FLAG_SEVEN_BITS)
10399 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10400 | CODING_ISO_FLAG_SINGLE_SHIFT))
10401 ? coding_category_iso_7_else
10402 : EQ (args[coding_arg_charset_list], Qiso_2022)
10403 ? coding_category_iso_7
10404 : coding_category_iso_7_tight);
10405 else
10407 int id = XINT (AREF (initial, 1));
10409 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10410 || EQ (args[coding_arg_charset_list], Qiso_2022)
10411 || id < 0)
10412 ? coding_category_iso_8_else
10413 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10414 ? coding_category_iso_8_1
10415 : coding_category_iso_8_2);
10417 if (category != coding_category_iso_8_1
10418 && category != coding_category_iso_8_2)
10419 ASET (attrs, coding_attr_ascii_compat, Qnil);
10421 else if (EQ (coding_type, Qemacs_mule))
10423 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10424 ASET (attrs, coding_attr_emacs_mule_full, Qt);
10425 ASET (attrs, coding_attr_ascii_compat, Qt);
10426 category = coding_category_emacs_mule;
10428 else if (EQ (coding_type, Qshift_jis))
10431 struct charset *charset;
10433 if (XINT (Flength (charset_list)) != 3
10434 && XINT (Flength (charset_list)) != 4)
10435 error ("There should be three or four charsets");
10437 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10438 if (CHARSET_DIMENSION (charset) != 1)
10439 error ("Dimension of charset %s is not one",
10440 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10441 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10442 ASET (attrs, coding_attr_ascii_compat, Qt);
10444 charset_list = XCDR (charset_list);
10445 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10446 if (CHARSET_DIMENSION (charset) != 1)
10447 error ("Dimension of charset %s is not one",
10448 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10450 charset_list = XCDR (charset_list);
10451 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10452 if (CHARSET_DIMENSION (charset) != 2)
10453 error ("Dimension of charset %s is not two",
10454 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10456 charset_list = XCDR (charset_list);
10457 if (! NILP (charset_list))
10459 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10460 if (CHARSET_DIMENSION (charset) != 2)
10461 error ("Dimension of charset %s is not two",
10462 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10465 category = coding_category_sjis;
10466 Vsjis_coding_system = name;
10468 else if (EQ (coding_type, Qbig5))
10470 struct charset *charset;
10472 if (XINT (Flength (charset_list)) != 2)
10473 error ("There should be just two charsets");
10475 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10476 if (CHARSET_DIMENSION (charset) != 1)
10477 error ("Dimension of charset %s is not one",
10478 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10479 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10480 ASET (attrs, coding_attr_ascii_compat, Qt);
10482 charset_list = XCDR (charset_list);
10483 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10484 if (CHARSET_DIMENSION (charset) != 2)
10485 error ("Dimension of charset %s is not two",
10486 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10488 category = coding_category_big5;
10489 Vbig5_coding_system = name;
10491 else if (EQ (coding_type, Qraw_text))
10493 category = coding_category_raw_text;
10494 ASET (attrs, coding_attr_ascii_compat, Qt);
10496 else if (EQ (coding_type, Qutf_8))
10498 Lisp_Object bom;
10500 if (nargs < coding_arg_utf8_max)
10501 goto short_args;
10503 bom = args[coding_arg_utf8_bom];
10504 if (! NILP (bom) && ! EQ (bom, Qt))
10506 CHECK_CONS (bom);
10507 val = XCAR (bom);
10508 CHECK_CODING_SYSTEM (val);
10509 val = XCDR (bom);
10510 CHECK_CODING_SYSTEM (val);
10512 ASET (attrs, coding_attr_utf_bom, bom);
10513 if (NILP (bom))
10514 ASET (attrs, coding_attr_ascii_compat, Qt);
10516 category = (CONSP (bom) ? coding_category_utf_8_auto
10517 : NILP (bom) ? coding_category_utf_8_nosig
10518 : coding_category_utf_8_sig);
10520 else if (EQ (coding_type, Qundecided))
10522 if (nargs < coding_arg_undecided_max)
10523 goto short_args;
10524 ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10525 args[coding_arg_undecided_inhibit_null_byte_detection]);
10526 ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10527 args[coding_arg_undecided_inhibit_iso_escape_detection]);
10528 ASET (attrs, coding_attr_undecided_prefer_utf_8,
10529 args[coding_arg_undecided_prefer_utf_8]);
10530 category = coding_category_undecided;
10532 else
10533 error ("Invalid coding system type: %s",
10534 SDATA (SYMBOL_NAME (coding_type)));
10536 ASET (attrs, coding_attr_category, make_number (category));
10537 ASET (attrs, coding_attr_plist,
10538 Fcons (QCcategory,
10539 Fcons (AREF (Vcoding_category_table, category),
10540 CODING_ATTR_PLIST (attrs))));
10541 ASET (attrs, coding_attr_plist,
10542 Fcons (QCascii_compatible_p,
10543 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10544 CODING_ATTR_PLIST (attrs))));
10546 eol_type = args[coding_arg_eol_type];
10547 if (! NILP (eol_type)
10548 && ! EQ (eol_type, Qunix)
10549 && ! EQ (eol_type, Qdos)
10550 && ! EQ (eol_type, Qmac))
10551 error ("Invalid eol-type");
10553 aliases = list1 (name);
10555 if (NILP (eol_type))
10557 eol_type = make_subsidiaries (name);
10558 for (i = 0; i < 3; i++)
10560 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10562 this_name = AREF (eol_type, i);
10563 this_aliases = list1 (this_name);
10564 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10565 this_spec = make_uninit_vector (3);
10566 ASET (this_spec, 0, attrs);
10567 ASET (this_spec, 1, this_aliases);
10568 ASET (this_spec, 2, this_eol_type);
10569 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10570 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10571 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10572 if (NILP (val))
10573 Vcoding_system_alist
10574 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10575 Vcoding_system_alist);
10579 spec_vec = make_uninit_vector (3);
10580 ASET (spec_vec, 0, attrs);
10581 ASET (spec_vec, 1, aliases);
10582 ASET (spec_vec, 2, eol_type);
10584 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10585 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10586 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10587 if (NILP (val))
10588 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10589 Vcoding_system_alist);
10592 int id = coding_categories[category].id;
10594 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10595 setup_coding_system (name, &coding_categories[category]);
10598 return Qnil;
10600 short_args:
10601 return Fsignal (Qwrong_number_of_arguments,
10602 Fcons (intern ("define-coding-system-internal"),
10603 make_number (nargs)));
10607 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10608 3, 3, 0,
10609 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10610 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10612 Lisp_Object spec, attrs;
10614 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10615 attrs = AREF (spec, 0);
10616 if (EQ (prop, QCmnemonic))
10618 if (! STRINGP (val))
10619 CHECK_CHARACTER (val);
10620 ASET (attrs, coding_attr_mnemonic, val);
10622 else if (EQ (prop, QCdefault_char))
10624 if (NILP (val))
10625 val = make_number (' ');
10626 else
10627 CHECK_CHARACTER (val);
10628 ASET (attrs, coding_attr_default_char, val);
10630 else if (EQ (prop, QCdecode_translation_table))
10632 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10633 CHECK_SYMBOL (val);
10634 ASET (attrs, coding_attr_decode_tbl, val);
10636 else if (EQ (prop, QCencode_translation_table))
10638 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10639 CHECK_SYMBOL (val);
10640 ASET (attrs, coding_attr_encode_tbl, val);
10642 else if (EQ (prop, QCpost_read_conversion))
10644 CHECK_SYMBOL (val);
10645 ASET (attrs, coding_attr_post_read, val);
10647 else if (EQ (prop, QCpre_write_conversion))
10649 CHECK_SYMBOL (val);
10650 ASET (attrs, coding_attr_pre_write, val);
10652 else if (EQ (prop, QCascii_compatible_p))
10654 ASET (attrs, coding_attr_ascii_compat, val);
10657 ASET (attrs, coding_attr_plist,
10658 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10659 return val;
10663 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10664 Sdefine_coding_system_alias, 2, 2, 0,
10665 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10666 (Lisp_Object alias, Lisp_Object coding_system)
10668 Lisp_Object spec, aliases, eol_type, val;
10670 CHECK_SYMBOL (alias);
10671 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10672 aliases = AREF (spec, 1);
10673 /* ALIASES should be a list of length more than zero, and the first
10674 element is a base coding system. Append ALIAS at the tail of the
10675 list. */
10676 while (!NILP (XCDR (aliases)))
10677 aliases = XCDR (aliases);
10678 XSETCDR (aliases, list1 (alias));
10680 eol_type = AREF (spec, 2);
10681 if (VECTORP (eol_type))
10683 Lisp_Object subsidiaries;
10684 int i;
10686 subsidiaries = make_subsidiaries (alias);
10687 for (i = 0; i < 3; i++)
10688 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10689 AREF (eol_type, i));
10692 Fputhash (alias, spec, Vcoding_system_hash_table);
10693 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10694 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10695 if (NILP (val))
10696 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10697 Vcoding_system_alist);
10699 return Qnil;
10702 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10703 1, 1, 0,
10704 doc: /* Return the base of CODING-SYSTEM.
10705 Any alias or subsidiary coding system is not a base coding system. */)
10706 (Lisp_Object coding_system)
10708 Lisp_Object spec, attrs;
10710 if (NILP (coding_system))
10711 return (Qno_conversion);
10712 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10713 attrs = AREF (spec, 0);
10714 return CODING_ATTR_BASE_NAME (attrs);
10717 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10718 1, 1, 0,
10719 doc: "Return the property list of CODING-SYSTEM.")
10720 (Lisp_Object coding_system)
10722 Lisp_Object spec, attrs;
10724 if (NILP (coding_system))
10725 coding_system = Qno_conversion;
10726 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10727 attrs = AREF (spec, 0);
10728 return CODING_ATTR_PLIST (attrs);
10732 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10733 1, 1, 0,
10734 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10735 (Lisp_Object coding_system)
10737 Lisp_Object spec;
10739 if (NILP (coding_system))
10740 coding_system = Qno_conversion;
10741 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10742 return AREF (spec, 1);
10745 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10746 Scoding_system_eol_type, 1, 1, 0,
10747 doc: /* Return eol-type of CODING-SYSTEM.
10748 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10750 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10751 and CR respectively.
10753 A vector value indicates that a format of end-of-line should be
10754 detected automatically. Nth element of the vector is the subsidiary
10755 coding system whose eol-type is N. */)
10756 (Lisp_Object coding_system)
10758 Lisp_Object spec, eol_type;
10759 int n;
10761 if (NILP (coding_system))
10762 coding_system = Qno_conversion;
10763 if (! CODING_SYSTEM_P (coding_system))
10764 return Qnil;
10765 spec = CODING_SYSTEM_SPEC (coding_system);
10766 eol_type = AREF (spec, 2);
10767 if (VECTORP (eol_type))
10768 return Fcopy_sequence (eol_type);
10769 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10770 return make_number (n);
10773 #endif /* emacs */
10776 /*** 9. Post-amble ***/
10778 void
10779 init_coding_once (void)
10781 int i;
10783 for (i = 0; i < coding_category_max; i++)
10785 coding_categories[i].id = -1;
10786 coding_priorities[i] = i;
10789 /* ISO2022 specific initialize routine. */
10790 for (i = 0; i < 0x20; i++)
10791 iso_code_class[i] = ISO_control_0;
10792 for (i = 0x21; i < 0x7F; i++)
10793 iso_code_class[i] = ISO_graphic_plane_0;
10794 for (i = 0x80; i < 0xA0; i++)
10795 iso_code_class[i] = ISO_control_1;
10796 for (i = 0xA1; i < 0xFF; i++)
10797 iso_code_class[i] = ISO_graphic_plane_1;
10798 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10799 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10800 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10801 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10802 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10803 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10804 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10805 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10806 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10808 for (i = 0; i < 256; i++)
10810 emacs_mule_bytes[i] = 1;
10812 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10813 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10814 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10815 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10818 #ifdef emacs
10820 void
10821 syms_of_coding (void)
10823 staticpro (&Vcoding_system_hash_table);
10825 Lisp_Object args[2];
10826 args[0] = QCtest;
10827 args[1] = Qeq;
10828 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10831 staticpro (&Vsjis_coding_system);
10832 Vsjis_coding_system = Qnil;
10834 staticpro (&Vbig5_coding_system);
10835 Vbig5_coding_system = Qnil;
10837 staticpro (&Vcode_conversion_reused_workbuf);
10838 Vcode_conversion_reused_workbuf = Qnil;
10840 staticpro (&Vcode_conversion_workbuf_name);
10841 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10843 reused_workbuf_in_use = 0;
10845 DEFSYM (Qcharset, "charset");
10846 DEFSYM (Qtarget_idx, "target-idx");
10847 DEFSYM (Qcoding_system_history, "coding-system-history");
10848 Fset (Qcoding_system_history, Qnil);
10850 /* Target FILENAME is the first argument. */
10851 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10852 /* Target FILENAME is the third argument. */
10853 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10855 DEFSYM (Qcall_process, "call-process");
10856 /* Target PROGRAM is the first argument. */
10857 Fput (Qcall_process, Qtarget_idx, make_number (0));
10859 DEFSYM (Qcall_process_region, "call-process-region");
10860 /* Target PROGRAM is the third argument. */
10861 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10863 DEFSYM (Qstart_process, "start-process");
10864 /* Target PROGRAM is the third argument. */
10865 Fput (Qstart_process, Qtarget_idx, make_number (2));
10867 DEFSYM (Qopen_network_stream, "open-network-stream");
10868 /* Target SERVICE is the fourth argument. */
10869 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10871 DEFSYM (Qcoding_system, "coding-system");
10872 DEFSYM (Qcoding_aliases, "coding-aliases");
10874 DEFSYM (Qeol_type, "eol-type");
10875 DEFSYM (Qunix, "unix");
10876 DEFSYM (Qdos, "dos");
10877 DEFSYM (Qmac, "mac");
10879 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10880 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10881 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10882 DEFSYM (Qdefault_char, "default-char");
10883 DEFSYM (Qundecided, "undecided");
10884 DEFSYM (Qno_conversion, "no-conversion");
10885 DEFSYM (Qraw_text, "raw-text");
10887 DEFSYM (Qiso_2022, "iso-2022");
10889 DEFSYM (Qutf_8, "utf-8");
10890 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10892 #if defined (WINDOWSNT) || defined (CYGWIN)
10893 /* No, not utf-16-le: that one has a BOM. */
10894 DEFSYM (Qutf_16le, "utf-16le");
10895 #endif
10897 DEFSYM (Qutf_16, "utf-16");
10898 DEFSYM (Qbig, "big");
10899 DEFSYM (Qlittle, "little");
10901 DEFSYM (Qshift_jis, "shift-jis");
10902 DEFSYM (Qbig5, "big5");
10904 DEFSYM (Qcoding_system_p, "coding-system-p");
10906 DEFSYM (Qcoding_system_error, "coding-system-error");
10907 Fput (Qcoding_system_error, Qerror_conditions,
10908 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10909 Fput (Qcoding_system_error, Qerror_message,
10910 build_pure_c_string ("Invalid coding system"));
10912 DEFSYM (Qtranslation_table, "translation-table");
10913 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10914 DEFSYM (Qtranslation_table_id, "translation-table-id");
10915 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10916 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10918 DEFSYM (Qvalid_codes, "valid-codes");
10920 DEFSYM (Qemacs_mule, "emacs-mule");
10922 DEFSYM (QCcategory, ":category");
10923 DEFSYM (QCmnemonic, ":mnemonic");
10924 DEFSYM (QCdefault_char, ":default-char");
10925 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10926 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10927 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10928 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10929 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10931 Vcoding_category_table
10932 = Fmake_vector (make_number (coding_category_max), Qnil);
10933 staticpro (&Vcoding_category_table);
10934 /* Followings are target of code detection. */
10935 ASET (Vcoding_category_table, coding_category_iso_7,
10936 intern_c_string ("coding-category-iso-7"));
10937 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10938 intern_c_string ("coding-category-iso-7-tight"));
10939 ASET (Vcoding_category_table, coding_category_iso_8_1,
10940 intern_c_string ("coding-category-iso-8-1"));
10941 ASET (Vcoding_category_table, coding_category_iso_8_2,
10942 intern_c_string ("coding-category-iso-8-2"));
10943 ASET (Vcoding_category_table, coding_category_iso_7_else,
10944 intern_c_string ("coding-category-iso-7-else"));
10945 ASET (Vcoding_category_table, coding_category_iso_8_else,
10946 intern_c_string ("coding-category-iso-8-else"));
10947 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10948 intern_c_string ("coding-category-utf-8-auto"));
10949 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10950 intern_c_string ("coding-category-utf-8"));
10951 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10952 intern_c_string ("coding-category-utf-8-sig"));
10953 ASET (Vcoding_category_table, coding_category_utf_16_be,
10954 intern_c_string ("coding-category-utf-16-be"));
10955 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10956 intern_c_string ("coding-category-utf-16-auto"));
10957 ASET (Vcoding_category_table, coding_category_utf_16_le,
10958 intern_c_string ("coding-category-utf-16-le"));
10959 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10960 intern_c_string ("coding-category-utf-16-be-nosig"));
10961 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10962 intern_c_string ("coding-category-utf-16-le-nosig"));
10963 ASET (Vcoding_category_table, coding_category_charset,
10964 intern_c_string ("coding-category-charset"));
10965 ASET (Vcoding_category_table, coding_category_sjis,
10966 intern_c_string ("coding-category-sjis"));
10967 ASET (Vcoding_category_table, coding_category_big5,
10968 intern_c_string ("coding-category-big5"));
10969 ASET (Vcoding_category_table, coding_category_ccl,
10970 intern_c_string ("coding-category-ccl"));
10971 ASET (Vcoding_category_table, coding_category_emacs_mule,
10972 intern_c_string ("coding-category-emacs-mule"));
10973 /* Followings are NOT target of code detection. */
10974 ASET (Vcoding_category_table, coding_category_raw_text,
10975 intern_c_string ("coding-category-raw-text"));
10976 ASET (Vcoding_category_table, coding_category_undecided,
10977 intern_c_string ("coding-category-undecided"));
10979 DEFSYM (Qinsufficient_source, "insufficient-source");
10980 DEFSYM (Qinvalid_source, "invalid-source");
10981 DEFSYM (Qinterrupted, "interrupted");
10982 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10984 defsubr (&Scoding_system_p);
10985 defsubr (&Sread_coding_system);
10986 defsubr (&Sread_non_nil_coding_system);
10987 defsubr (&Scheck_coding_system);
10988 defsubr (&Sdetect_coding_region);
10989 defsubr (&Sdetect_coding_string);
10990 defsubr (&Sfind_coding_systems_region_internal);
10991 defsubr (&Sunencodable_char_position);
10992 defsubr (&Scheck_coding_systems_region);
10993 defsubr (&Sdecode_coding_region);
10994 defsubr (&Sencode_coding_region);
10995 defsubr (&Sdecode_coding_string);
10996 defsubr (&Sencode_coding_string);
10997 defsubr (&Sdecode_sjis_char);
10998 defsubr (&Sencode_sjis_char);
10999 defsubr (&Sdecode_big5_char);
11000 defsubr (&Sencode_big5_char);
11001 defsubr (&Sset_terminal_coding_system_internal);
11002 defsubr (&Sset_safe_terminal_coding_system_internal);
11003 defsubr (&Sterminal_coding_system);
11004 defsubr (&Sset_keyboard_coding_system_internal);
11005 defsubr (&Skeyboard_coding_system);
11006 defsubr (&Sfind_operation_coding_system);
11007 defsubr (&Sset_coding_system_priority);
11008 defsubr (&Sdefine_coding_system_internal);
11009 defsubr (&Sdefine_coding_system_alias);
11010 defsubr (&Scoding_system_put);
11011 defsubr (&Scoding_system_base);
11012 defsubr (&Scoding_system_plist);
11013 defsubr (&Scoding_system_aliases);
11014 defsubr (&Scoding_system_eol_type);
11015 defsubr (&Scoding_system_priority_list);
11017 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
11018 doc: /* List of coding systems.
11020 Do not alter the value of this variable manually. This variable should be
11021 updated by the functions `define-coding-system' and
11022 `define-coding-system-alias'. */);
11023 Vcoding_system_list = Qnil;
11025 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11026 doc: /* Alist of coding system names.
11027 Each element is one element list of coding system name.
11028 This variable is given to `completing-read' as COLLECTION argument.
11030 Do not alter the value of this variable manually. This variable should be
11031 updated by the functions `make-coding-system' and
11032 `define-coding-system-alias'. */);
11033 Vcoding_system_alist = Qnil;
11035 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11036 doc: /* List of coding-categories (symbols) ordered by priority.
11038 On detecting a coding system, Emacs tries code detection algorithms
11039 associated with each coding-category one by one in this order. When
11040 one algorithm agrees with a byte sequence of source text, the coding
11041 system bound to the corresponding coding-category is selected.
11043 Don't modify this variable directly, but use `set-coding-system-priority'. */);
11045 int i;
11047 Vcoding_category_list = Qnil;
11048 for (i = coding_category_max - 1; i >= 0; i--)
11049 Vcoding_category_list
11050 = Fcons (AREF (Vcoding_category_table, i),
11051 Vcoding_category_list);
11054 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11055 doc: /* Specify the coding system for read operations.
11056 It is useful to bind this variable with `let', but do not set it globally.
11057 If the value is a coding system, it is used for decoding on read operation.
11058 If not, an appropriate element is used from one of the coding system alists.
11059 There are three such tables: `file-coding-system-alist',
11060 `process-coding-system-alist', and `network-coding-system-alist'. */);
11061 Vcoding_system_for_read = Qnil;
11063 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11064 doc: /* Specify the coding system for write operations.
11065 Programs bind this variable with `let', but you should not set it globally.
11066 If the value is a coding system, it is used for encoding of output,
11067 when writing it to a file and when sending it to a file or subprocess.
11069 If this does not specify a coding system, an appropriate element
11070 is used from one of the coding system alists.
11071 There are three such tables: `file-coding-system-alist',
11072 `process-coding-system-alist', and `network-coding-system-alist'.
11073 For output to files, if the above procedure does not specify a coding system,
11074 the value of `buffer-file-coding-system' is used. */);
11075 Vcoding_system_for_write = Qnil;
11077 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11078 doc: /*
11079 Coding system used in the latest file or process I/O. */);
11080 Vlast_coding_system_used = Qnil;
11082 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11083 doc: /*
11084 Error status of the last code conversion.
11086 When an error was detected in the last code conversion, this variable
11087 is set to one of the following symbols.
11088 `insufficient-source'
11089 `inconsistent-eol'
11090 `invalid-source'
11091 `interrupted'
11092 `insufficient-memory'
11093 When no error was detected, the value doesn't change. So, to check
11094 the error status of a code conversion by this variable, you must
11095 explicitly set this variable to nil before performing code
11096 conversion. */);
11097 Vlast_code_conversion_error = Qnil;
11099 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11100 doc: /*
11101 *Non-nil means always inhibit code conversion of end-of-line format.
11102 See info node `Coding Systems' and info node `Text and Binary' concerning
11103 such conversion. */);
11104 inhibit_eol_conversion = 0;
11106 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11107 doc: /*
11108 Non-nil means process buffer inherits coding system of process output.
11109 Bind it to t if the process output is to be treated as if it were a file
11110 read from some filesystem. */);
11111 inherit_process_coding_system = 0;
11113 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11114 doc: /*
11115 Alist to decide a coding system to use for a file I/O operation.
11116 The format is ((PATTERN . VAL) ...),
11117 where PATTERN is a regular expression matching a file name,
11118 VAL is a coding system, a cons of coding systems, or a function symbol.
11119 If VAL is a coding system, it is used for both decoding and encoding
11120 the file contents.
11121 If VAL is a cons of coding systems, the car part is used for decoding,
11122 and the cdr part is used for encoding.
11123 If VAL is a function symbol, the function must return a coding system
11124 or a cons of coding systems which are used as above. The function is
11125 called with an argument that is a list of the arguments with which
11126 `find-operation-coding-system' was called. If the function can't decide
11127 a coding system, it can return `undecided' so that the normal
11128 code-detection is performed.
11130 See also the function `find-operation-coding-system'
11131 and the variable `auto-coding-alist'. */);
11132 Vfile_coding_system_alist = Qnil;
11134 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11135 doc: /*
11136 Alist to decide a coding system to use for a process I/O operation.
11137 The format is ((PATTERN . VAL) ...),
11138 where PATTERN is a regular expression matching a program name,
11139 VAL is a coding system, a cons of coding systems, or a function symbol.
11140 If VAL is a coding system, it is used for both decoding what received
11141 from the program and encoding what sent to the program.
11142 If VAL is a cons of coding systems, the car part is used for decoding,
11143 and the cdr part is used for encoding.
11144 If VAL is a function symbol, the function must return a coding system
11145 or a cons of coding systems which are used as above.
11147 See also the function `find-operation-coding-system'. */);
11148 Vprocess_coding_system_alist = Qnil;
11150 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11151 doc: /*
11152 Alist to decide a coding system to use for a network I/O operation.
11153 The format is ((PATTERN . VAL) ...),
11154 where PATTERN is a regular expression matching a network service name
11155 or is a port number to connect to,
11156 VAL is a coding system, a cons of coding systems, or a function symbol.
11157 If VAL is a coding system, it is used for both decoding what received
11158 from the network stream and encoding what sent to the network stream.
11159 If VAL is a cons of coding systems, the car part is used for decoding,
11160 and the cdr part is used for encoding.
11161 If VAL is a function symbol, the function must return a coding system
11162 or a cons of coding systems which are used as above.
11164 See also the function `find-operation-coding-system'. */);
11165 Vnetwork_coding_system_alist = Qnil;
11167 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11168 doc: /* Coding system to use with system messages.
11169 Also used for decoding keyboard input on X Window system. */);
11170 Vlocale_coding_system = Qnil;
11172 /* The eol mnemonics are reset in startup.el system-dependently. */
11173 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11174 doc: /*
11175 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
11176 eol_mnemonic_unix = build_pure_c_string (":");
11178 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11179 doc: /*
11180 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
11181 eol_mnemonic_dos = build_pure_c_string ("\\");
11183 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11184 doc: /*
11185 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
11186 eol_mnemonic_mac = build_pure_c_string ("/");
11188 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11189 doc: /*
11190 *String displayed in mode line when end-of-line format is not yet determined. */);
11191 eol_mnemonic_undecided = build_pure_c_string (":");
11193 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11194 doc: /*
11195 *Non-nil enables character translation while encoding and decoding. */);
11196 Venable_character_translation = Qt;
11198 DEFVAR_LISP ("standard-translation-table-for-decode",
11199 Vstandard_translation_table_for_decode,
11200 doc: /* Table for translating characters while decoding. */);
11201 Vstandard_translation_table_for_decode = Qnil;
11203 DEFVAR_LISP ("standard-translation-table-for-encode",
11204 Vstandard_translation_table_for_encode,
11205 doc: /* Table for translating characters while encoding. */);
11206 Vstandard_translation_table_for_encode = Qnil;
11208 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11209 doc: /* Alist of charsets vs revision numbers.
11210 While encoding, if a charset (car part of an element) is found,
11211 designate it with the escape sequence identifying revision (cdr part
11212 of the element). */);
11213 Vcharset_revision_table = Qnil;
11215 DEFVAR_LISP ("default-process-coding-system",
11216 Vdefault_process_coding_system,
11217 doc: /* Cons of coding systems used for process I/O by default.
11218 The car part is used for decoding a process output,
11219 the cdr part is used for encoding a text to be sent to a process. */);
11220 Vdefault_process_coding_system = Qnil;
11222 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11223 doc: /*
11224 Table of extra Latin codes in the range 128..159 (inclusive).
11225 This is a vector of length 256.
11226 If Nth element is non-nil, the existence of code N in a file
11227 \(or output of subprocess) doesn't prevent it to be detected as
11228 a coding system of ISO 2022 variant which has a flag
11229 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11230 or reading output of a subprocess.
11231 Only 128th through 159th elements have a meaning. */);
11232 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11234 DEFVAR_LISP ("select-safe-coding-system-function",
11235 Vselect_safe_coding_system_function,
11236 doc: /*
11237 Function to call to select safe coding system for encoding a text.
11239 If set, this function is called to force a user to select a proper
11240 coding system which can encode the text in the case that a default
11241 coding system used in each operation can't encode the text. The
11242 function should take care that the buffer is not modified while
11243 the coding system is being selected.
11245 The default value is `select-safe-coding-system' (which see). */);
11246 Vselect_safe_coding_system_function = Qnil;
11248 DEFVAR_BOOL ("coding-system-require-warning",
11249 coding_system_require_warning,
11250 doc: /* Internal use only.
11251 If non-nil, on writing a file, `select-safe-coding-system-function' is
11252 called even if `coding-system-for-write' is non-nil. The command
11253 `universal-coding-system-argument' binds this variable to t temporarily. */);
11254 coding_system_require_warning = 0;
11257 DEFVAR_BOOL ("inhibit-iso-escape-detection",
11258 inhibit_iso_escape_detection,
11259 doc: /*
11260 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11262 When Emacs reads text, it tries to detect how the text is encoded.
11263 This code detection is sensitive to escape sequences. If Emacs sees
11264 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11265 of the ISO2022 encodings, and decodes text by the corresponding coding
11266 system (e.g. `iso-2022-7bit').
11268 However, there may be a case that you want to read escape sequences in
11269 a file as is. In such a case, you can set this variable to non-nil.
11270 Then the code detection will ignore any escape sequences, and no text is
11271 detected as encoded in some ISO-2022 encoding. The result is that all
11272 escape sequences become visible in a buffer.
11274 The default value is nil, and it is strongly recommended not to change
11275 it. That is because many Emacs Lisp source files that contain
11276 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11277 in Emacs's distribution, and they won't be decoded correctly on
11278 reading if you suppress escape sequence detection.
11280 The other way to read escape sequences in a file without decoding is
11281 to explicitly specify some coding system that doesn't use ISO-2022
11282 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument]. */);
11283 inhibit_iso_escape_detection = 0;
11285 DEFVAR_BOOL ("inhibit-null-byte-detection",
11286 inhibit_null_byte_detection,
11287 doc: /* If non-nil, Emacs ignores null bytes on code detection.
11288 By default, Emacs treats it as binary data, and does not attempt to
11289 decode it. The effect is as if you specified `no-conversion' for
11290 reading that text.
11292 Set this to non-nil when a regular text happens to include null bytes.
11293 Examples are Index nodes of Info files and null-byte delimited output
11294 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
11295 decode text as usual. */);
11296 inhibit_null_byte_detection = 0;
11298 DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11299 doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11300 Internal use only. Remove after the experimental optimizer becomes stable. */);
11301 disable_ascii_optimization = 0;
11303 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11304 doc: /* Char table for translating self-inserting characters.
11305 This is applied to the result of input methods, not their input.
11306 See also `keyboard-translate-table'.
11308 Use of this variable for character code unification was rendered
11309 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11310 internal character representation. */);
11311 Vtranslation_table_for_input = Qnil;
11314 Lisp_Object args[coding_arg_undecided_max];
11315 Lisp_Object plist[16];
11316 int i;
11318 for (i = 0; i < coding_arg_undecided_max; i++)
11319 args[i] = Qnil;
11321 plist[0] = intern_c_string (":name");
11322 plist[1] = args[coding_arg_name] = Qno_conversion;
11323 plist[2] = intern_c_string (":mnemonic");
11324 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11325 plist[4] = intern_c_string (":coding-type");
11326 plist[5] = args[coding_arg_coding_type] = Qraw_text;
11327 plist[6] = intern_c_string (":ascii-compatible-p");
11328 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11329 plist[8] = intern_c_string (":default-char");
11330 plist[9] = args[coding_arg_default_char] = make_number (0);
11331 plist[10] = intern_c_string (":for-unibyte");
11332 plist[11] = args[coding_arg_for_unibyte] = Qt;
11333 plist[12] = intern_c_string (":docstring");
11334 plist[13] = build_pure_c_string ("Do no conversion.\n\
11336 When you visit a file with this coding, the file is read into a\n\
11337 unibyte buffer as is, thus each byte of a file is treated as a\n\
11338 character.");
11339 plist[14] = intern_c_string (":eol-type");
11340 plist[15] = args[coding_arg_eol_type] = Qunix;
11341 args[coding_arg_plist] = Flist (16, plist);
11342 Fdefine_coding_system_internal (coding_arg_max, args);
11344 plist[1] = args[coding_arg_name] = Qundecided;
11345 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11346 plist[5] = args[coding_arg_coding_type] = Qundecided;
11347 /* This is already set.
11348 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11349 plist[8] = intern_c_string (":charset-list");
11350 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11351 plist[11] = args[coding_arg_for_unibyte] = Qnil;
11352 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11353 plist[15] = args[coding_arg_eol_type] = Qnil;
11354 args[coding_arg_plist] = Flist (16, plist);
11355 args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11356 args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11357 Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11360 setup_coding_system (Qno_conversion, &safe_terminal_coding);
11363 int i;
11365 for (i = 0; i < coding_category_max; i++)
11366 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11368 #if defined (DOS_NT)
11369 system_eol_type = Qdos;
11370 #else
11371 system_eol_type = Qunix;
11372 #endif
11373 staticpro (&system_eol_type);
11376 char *
11377 emacs_strerror (int error_number)
11379 char *str;
11381 synchronize_system_messages_locale ();
11382 str = strerror (error_number);
11384 if (! NILP (Vlocale_coding_system))
11386 Lisp_Object dec = code_convert_string_norecord (build_string (str),
11387 Vlocale_coding_system,
11389 str = SSDATA (dec);
11392 return str;
11395 #endif /* emacs */