Timestamp fixes for undo.
[emacs.git] / src / coding.c
blob1ab59294b9889c862e3093daf4c5e9e15c9180f3
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2013 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26 /*** TABLE OF CONTENTS ***
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
44 /*** 0. General comments ***
47 CODING SYSTEM
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
67 o UTF-8
69 o UTF-16
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
94 o BIG5
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
102 o CCL
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
109 o Raw-text
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
115 o No-conversion
117 Like raw text, but don't do end-of-line conversion.
120 END-OF-LINE FORMAT
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
132 STRUCT CODING_SYSTEM
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
141 /* COMMON MACROS */
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
150 Return true if the byte sequence conforms to XXX.
152 Below is the template of these functions. */
154 #if 0
155 static bool
156 detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
161 bool multibytep = coding->src_multibyte;
162 ptrdiff_t consumed_chars = 0;
163 int found = 0;
164 ...;
166 while (1)
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (! __C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX;
177 /* The byte sequence is invalid for XXX. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0;
181 no_more_source:
182 /* The source exhausted successfully. */
183 detect_info->found |= found;
184 return 1;
186 #endif
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
200 Below is the template of these functions. */
202 #if 0
203 static void
204 decode_coding_XXXX (struct coding_system *coding)
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base;
212 /* A buffer to produce decoded characters. */
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
215 bool multibytep = coding->src_multibyte;
217 while (1)
219 src_base = src;
220 if (charbuf < charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
240 #endif
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
258 Below is a template of these functions. */
259 #if 0
260 static void
261 encode_coding_XXX (struct coding_system *coding)
263 bool multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
269 ptrdiff_t produced_chars = 0;
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. */
276 label_no_more_destination:
277 /* How many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
281 #endif
284 /*** 1. Preamble ***/
286 #include <config.h>
287 #include <stdio.h>
289 #ifdef HAVE_WCHAR_H
290 #include <wchar.h>
291 #endif /* HAVE_WCHAR_H */
293 #include "lisp.h"
294 #include "character.h"
295 #include "buffer.h"
296 #include "charset.h"
297 #include "ccl.h"
298 #include "composite.h"
299 #include "coding.h"
300 #include "window.h"
301 #include "frame.h"
302 #include "termhooks.h"
304 Lisp_Object Vcoding_system_hash_table;
306 static Lisp_Object Qcoding_system, Qeol_type;
307 static Lisp_Object Qcoding_aliases;
308 Lisp_Object Qunix, Qdos;
309 static Lisp_Object Qmac;
310 Lisp_Object Qbuffer_file_coding_system;
311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
312 static Lisp_Object Qdefault_char;
313 Lisp_Object Qno_conversion, Qundecided;
314 Lisp_Object Qcharset, Qutf_8;
315 static Lisp_Object Qiso_2022;
316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
317 static Lisp_Object Qbig, Qlittle;
318 static Lisp_Object Qcoding_system_history;
319 static Lisp_Object Qvalid_codes;
320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
323 static Lisp_Object QCascii_compatible_p;
325 Lisp_Object Qcall_process, Qcall_process_region;
326 Lisp_Object Qstart_process, Qopen_network_stream;
327 static Lisp_Object Qtarget_idx;
329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
331 /* If a symbol has this property, evaluate the value to define the
332 symbol as a coding system. */
333 static Lisp_Object Qcoding_system_define_form;
335 /* Format of end-of-line decided by system. This is Qunix on
336 Unix and Mac, Qdos on DOS/Windows.
337 This has an effect only for external encoding (i.e. for output to
338 file and process), not for in-buffer or Lisp string encoding. */
339 static Lisp_Object system_eol_type;
341 #ifdef emacs
343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
345 /* Coding system emacs-mule and raw-text are for converting only
346 end-of-line format. */
347 Lisp_Object Qemacs_mule, Qraw_text;
348 Lisp_Object Qutf_8_emacs;
350 #if defined (WINDOWSNT) || defined (CYGWIN)
351 static Lisp_Object Qutf_16le;
352 #endif
354 /* Coding-systems are handed between Emacs Lisp programs and C internal
355 routines by the following three variables. */
356 /* Coding system to be used to encode text for terminal display when
357 terminal coding system is nil. */
358 struct coding_system safe_terminal_coding;
360 #endif /* emacs */
362 Lisp_Object Qtranslation_table;
363 Lisp_Object Qtranslation_table_id;
364 static Lisp_Object Qtranslation_table_for_decode;
365 static Lisp_Object Qtranslation_table_for_encode;
367 /* Two special coding systems. */
368 static Lisp_Object Vsjis_coding_system;
369 static Lisp_Object Vbig5_coding_system;
371 /* ISO2022 section */
373 #define CODING_ISO_INITIAL(coding, reg) \
374 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
375 coding_attr_iso_initial), \
376 reg)))
379 #define CODING_ISO_REQUEST(coding, charset_id) \
380 (((charset_id) <= (coding)->max_charset_id \
381 ? ((coding)->safe_charsets[charset_id] != 255 \
382 ? (coding)->safe_charsets[charset_id] \
383 : -1) \
384 : -1))
387 #define CODING_ISO_FLAGS(coding) \
388 ((coding)->spec.iso_2022.flags)
389 #define CODING_ISO_DESIGNATION(coding, reg) \
390 ((coding)->spec.iso_2022.current_designation[reg])
391 #define CODING_ISO_INVOCATION(coding, plane) \
392 ((coding)->spec.iso_2022.current_invocation[plane])
393 #define CODING_ISO_SINGLE_SHIFTING(coding) \
394 ((coding)->spec.iso_2022.single_shifting)
395 #define CODING_ISO_BOL(coding) \
396 ((coding)->spec.iso_2022.bol)
397 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
398 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
399 #define CODING_ISO_CMP_STATUS(coding) \
400 (&(coding)->spec.iso_2022.cmp_status)
401 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
402 ((coding)->spec.iso_2022.ctext_extended_segment_len)
403 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
404 ((coding)->spec.iso_2022.embedded_utf_8)
406 /* Control characters of ISO2022. */
407 /* code */ /* function */
408 #define ISO_CODE_SO 0x0E /* shift-out */
409 #define ISO_CODE_SI 0x0F /* shift-in */
410 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
411 #define ISO_CODE_ESC 0x1B /* escape */
412 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
413 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
414 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
416 /* All code (1-byte) of ISO2022 is classified into one of the
417 followings. */
418 enum iso_code_class_type
420 ISO_control_0, /* Control codes in the range
421 0x00..0x1F and 0x7F, except for the
422 following 5 codes. */
423 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
424 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
425 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
426 ISO_escape, /* ISO_CODE_ESC (0x1B) */
427 ISO_control_1, /* Control codes in the range
428 0x80..0x9F, except for the
429 following 3 codes. */
430 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
431 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
432 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
433 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
434 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
435 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
436 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
440 `iso-flags' attribute of an iso2022 coding system. */
442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
443 instead of the correct short-form sequence (e.g. ESC $ A). */
444 #define CODING_ISO_FLAG_LONG_FORM 0x0001
446 /* If set, reset graphic planes and registers at end-of-line to the
447 initial state. */
448 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
450 /* If set, reset graphic planes and registers before any control
451 characters to the initial state. */
452 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
454 /* If set, encode by 7-bit environment. */
455 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
457 /* If set, use locking-shift function. */
458 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
460 /* If set, use single-shift function. Overwrite
461 CODING_ISO_FLAG_LOCKING_SHIFT. */
462 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
464 /* If set, use designation escape sequence. */
465 #define CODING_ISO_FLAG_DESIGNATION 0x0040
467 /* If set, produce revision number sequence. */
468 #define CODING_ISO_FLAG_REVISION 0x0080
470 /* If set, produce ISO6429's direction specifying sequence. */
471 #define CODING_ISO_FLAG_DIRECTION 0x0100
473 /* If set, assume designation states are reset at beginning of line on
474 output. */
475 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
477 /* If set, designation sequence should be placed at beginning of line
478 on output. */
479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
481 /* If set, do not encode unsafe characters on output. */
482 #define CODING_ISO_FLAG_SAFE 0x0800
484 /* If set, extra latin codes (128..159) are accepted as a valid code
485 on input. */
486 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
488 #define CODING_ISO_FLAG_COMPOSITION 0x2000
490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
492 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
494 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
496 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
498 /* A character to be produced on output if encoding of the original
499 character is prohibited by CODING_ISO_FLAG_SAFE. */
500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
502 /* UTF-8 section */
503 #define CODING_UTF_8_BOM(coding) \
504 ((coding)->spec.utf_8_bom)
506 /* UTF-16 section */
507 #define CODING_UTF_16_BOM(coding) \
508 ((coding)->spec.utf_16.bom)
510 #define CODING_UTF_16_ENDIAN(coding) \
511 ((coding)->spec.utf_16.endian)
513 #define CODING_UTF_16_SURROGATE(coding) \
514 ((coding)->spec.utf_16.surrogate)
517 /* CCL section */
518 #define CODING_CCL_DECODER(coding) \
519 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
520 #define CODING_CCL_ENCODER(coding) \
521 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
522 #define CODING_CCL_VALIDS(coding) \
523 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
525 /* Index for each coding category in `coding_categories' */
527 enum coding_category
529 coding_category_iso_7,
530 coding_category_iso_7_tight,
531 coding_category_iso_8_1,
532 coding_category_iso_8_2,
533 coding_category_iso_7_else,
534 coding_category_iso_8_else,
535 coding_category_utf_8_auto,
536 coding_category_utf_8_nosig,
537 coding_category_utf_8_sig,
538 coding_category_utf_16_auto,
539 coding_category_utf_16_be,
540 coding_category_utf_16_le,
541 coding_category_utf_16_be_nosig,
542 coding_category_utf_16_le_nosig,
543 coding_category_charset,
544 coding_category_sjis,
545 coding_category_big5,
546 coding_category_ccl,
547 coding_category_emacs_mule,
548 /* All above are targets of code detection. */
549 coding_category_raw_text,
550 coding_category_undecided,
551 coding_category_max
554 /* Definitions of flag bits used in detect_coding_XXXX. */
555 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
556 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
557 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
558 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
559 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
560 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
561 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
562 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
563 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
564 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
565 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
566 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
567 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
568 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
569 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
570 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
571 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
572 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
573 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
574 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
576 /* This value is returned if detect_coding_mask () find nothing other
577 than ASCII characters. */
578 #define CATEGORY_MASK_ANY \
579 (CATEGORY_MASK_ISO_7 \
580 | CATEGORY_MASK_ISO_7_TIGHT \
581 | CATEGORY_MASK_ISO_8_1 \
582 | CATEGORY_MASK_ISO_8_2 \
583 | CATEGORY_MASK_ISO_7_ELSE \
584 | CATEGORY_MASK_ISO_8_ELSE \
585 | CATEGORY_MASK_UTF_8_AUTO \
586 | CATEGORY_MASK_UTF_8_NOSIG \
587 | CATEGORY_MASK_UTF_8_SIG \
588 | CATEGORY_MASK_UTF_16_AUTO \
589 | CATEGORY_MASK_UTF_16_BE \
590 | CATEGORY_MASK_UTF_16_LE \
591 | CATEGORY_MASK_UTF_16_BE_NOSIG \
592 | CATEGORY_MASK_UTF_16_LE_NOSIG \
593 | CATEGORY_MASK_CHARSET \
594 | CATEGORY_MASK_SJIS \
595 | CATEGORY_MASK_BIG5 \
596 | CATEGORY_MASK_CCL \
597 | CATEGORY_MASK_EMACS_MULE)
600 #define CATEGORY_MASK_ISO_7BIT \
601 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
603 #define CATEGORY_MASK_ISO_8BIT \
604 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
606 #define CATEGORY_MASK_ISO_ELSE \
607 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
609 #define CATEGORY_MASK_ISO_ESCAPE \
610 (CATEGORY_MASK_ISO_7 \
611 | CATEGORY_MASK_ISO_7_TIGHT \
612 | CATEGORY_MASK_ISO_7_ELSE \
613 | CATEGORY_MASK_ISO_8_ELSE)
615 #define CATEGORY_MASK_ISO \
616 ( CATEGORY_MASK_ISO_7BIT \
617 | CATEGORY_MASK_ISO_8BIT \
618 | CATEGORY_MASK_ISO_ELSE)
620 #define CATEGORY_MASK_UTF_16 \
621 (CATEGORY_MASK_UTF_16_AUTO \
622 | CATEGORY_MASK_UTF_16_BE \
623 | CATEGORY_MASK_UTF_16_LE \
624 | CATEGORY_MASK_UTF_16_BE_NOSIG \
625 | CATEGORY_MASK_UTF_16_LE_NOSIG)
627 #define CATEGORY_MASK_UTF_8 \
628 (CATEGORY_MASK_UTF_8_AUTO \
629 | CATEGORY_MASK_UTF_8_NOSIG \
630 | CATEGORY_MASK_UTF_8_SIG)
632 /* Table of coding categories (Lisp symbols). This variable is for
633 internal use only. */
634 static Lisp_Object Vcoding_category_table;
636 /* Table of coding-categories ordered by priority. */
637 static enum coding_category coding_priorities[coding_category_max];
639 /* Nth element is a coding context for the coding system bound to the
640 Nth coding category. */
641 static struct coding_system coding_categories[coding_category_max];
643 /*** Commonly used macros and functions ***/
645 #ifndef min
646 #define min(a, b) ((a) < (b) ? (a) : (b))
647 #endif
648 #ifndef max
649 #define max(a, b) ((a) > (b) ? (a) : (b))
650 #endif
652 /* Encode a flag that can be nil, something else, or t as -1, 0, 1. */
654 static int
655 encode_inhibit_flag (Lisp_Object flag)
657 return NILP (flag) ? -1 : EQ (flag, Qt);
660 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
661 1 means yes, -1 means no, 0 means ask the user variable VAR. */
663 static bool
664 inhibit_flag (int encoded_flag, bool var)
666 return 0 < encoded_flag + var;
669 #define CODING_GET_INFO(coding, attrs, charset_list) \
670 do { \
671 (attrs) = CODING_ID_ATTRS ((coding)->id); \
672 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
673 } while (0)
675 static void
676 CHECK_NATNUM_CAR (Lisp_Object x)
678 Lisp_Object tmp = XCAR (x);
679 CHECK_NATNUM (tmp);
680 XSETCAR (x, tmp);
683 static void
684 CHECK_NATNUM_CDR (Lisp_Object x)
686 Lisp_Object tmp = XCDR (x);
687 CHECK_NATNUM (tmp);
688 XSETCDR (x, tmp);
692 /* Safely get one byte from the source text pointed by SRC which ends
693 at SRC_END, and set C to that byte. If there are not enough bytes
694 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
695 and a multibyte character is found at SRC, set C to the
696 negative value of the character code. The caller should declare
697 and set these variables appropriately in advance:
698 src, src_end, multibytep */
700 #define ONE_MORE_BYTE(c) \
701 do { \
702 if (src == src_end) \
704 if (src_base < src) \
705 record_conversion_result \
706 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
707 goto no_more_source; \
709 c = *src++; \
710 if (multibytep && (c & 0x80)) \
712 if ((c & 0xFE) == 0xC0) \
713 c = ((c & 1) << 6) | *src++; \
714 else \
716 src--; \
717 c = - string_char (src, &src, NULL); \
718 record_conversion_result \
719 (coding, CODING_RESULT_INVALID_SRC); \
722 consumed_chars++; \
723 } while (0)
725 /* Safely get two bytes from the source text pointed by SRC which ends
726 at SRC_END, and set C1 and C2 to those bytes while skipping the
727 heading multibyte characters. If there are not enough bytes in the
728 source, it jumps to 'no_more_source'. If MULTIBYTEP and
729 a multibyte character is found for C2, set C2 to the negative value
730 of the character code. The caller should declare and set these
731 variables appropriately in advance:
732 src, src_end, multibytep
733 It is intended that this macro is used in detect_coding_utf_16. */
735 #define TWO_MORE_BYTES(c1, c2) \
736 do { \
737 do { \
738 if (src == src_end) \
739 goto no_more_source; \
740 c1 = *src++; \
741 if (multibytep && (c1 & 0x80)) \
743 if ((c1 & 0xFE) == 0xC0) \
744 c1 = ((c1 & 1) << 6) | *src++; \
745 else \
747 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
748 c1 = -1; \
751 } while (c1 < 0); \
752 if (src == src_end) \
753 goto no_more_source; \
754 c2 = *src++; \
755 if (multibytep && (c2 & 0x80)) \
757 if ((c2 & 0xFE) == 0xC0) \
758 c2 = ((c2 & 1) << 6) | *src++; \
759 else \
760 c2 = -1; \
762 } while (0)
765 /* Store a byte C in the place pointed by DST and increment DST to the
766 next free point, and increment PRODUCED_CHARS. The caller should
767 assure that C is 0..127, and declare and set the variable `dst'
768 appropriately in advance.
772 #define EMIT_ONE_ASCII_BYTE(c) \
773 do { \
774 produced_chars++; \
775 *dst++ = (c); \
776 } while (0)
779 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
781 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
782 do { \
783 produced_chars += 2; \
784 *dst++ = (c1), *dst++ = (c2); \
785 } while (0)
788 /* Store a byte C in the place pointed by DST and increment DST to the
789 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
790 store in an appropriate multibyte form. The caller should
791 declare and set the variables `dst' and `multibytep' appropriately
792 in advance. */
794 #define EMIT_ONE_BYTE(c) \
795 do { \
796 produced_chars++; \
797 if (multibytep) \
799 unsigned ch = (c); \
800 if (ch >= 0x80) \
801 ch = BYTE8_TO_CHAR (ch); \
802 CHAR_STRING_ADVANCE (ch, dst); \
804 else \
805 *dst++ = (c); \
806 } while (0)
809 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
811 #define EMIT_TWO_BYTES(c1, c2) \
812 do { \
813 produced_chars += 2; \
814 if (multibytep) \
816 unsigned ch; \
818 ch = (c1); \
819 if (ch >= 0x80) \
820 ch = BYTE8_TO_CHAR (ch); \
821 CHAR_STRING_ADVANCE (ch, dst); \
822 ch = (c2); \
823 if (ch >= 0x80) \
824 ch = BYTE8_TO_CHAR (ch); \
825 CHAR_STRING_ADVANCE (ch, dst); \
827 else \
829 *dst++ = (c1); \
830 *dst++ = (c2); \
832 } while (0)
835 #define EMIT_THREE_BYTES(c1, c2, c3) \
836 do { \
837 EMIT_ONE_BYTE (c1); \
838 EMIT_TWO_BYTES (c2, c3); \
839 } while (0)
842 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
843 do { \
844 EMIT_TWO_BYTES (c1, c2); \
845 EMIT_TWO_BYTES (c3, c4); \
846 } while (0)
849 static void
850 record_conversion_result (struct coding_system *coding,
851 enum coding_result_code result)
853 coding->result = result;
854 switch (result)
856 case CODING_RESULT_INSUFFICIENT_SRC:
857 Vlast_code_conversion_error = Qinsufficient_source;
858 break;
859 case CODING_RESULT_INVALID_SRC:
860 Vlast_code_conversion_error = Qinvalid_source;
861 break;
862 case CODING_RESULT_INTERRUPT:
863 Vlast_code_conversion_error = Qinterrupted;
864 break;
865 case CODING_RESULT_INSUFFICIENT_DST:
866 /* Don't record this error in Vlast_code_conversion_error
867 because it happens just temporarily and is resolved when the
868 whole conversion is finished. */
869 break;
870 case CODING_RESULT_SUCCESS:
871 break;
872 default:
873 Vlast_code_conversion_error = intern ("Unknown error");
877 /* These wrapper macros are used to preserve validity of pointers into
878 buffer text across calls to decode_char, encode_char, etc, which
879 could cause relocation of buffers if it loads a charset map,
880 because loading a charset map allocates large structures. */
882 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
883 do { \
884 ptrdiff_t offset; \
886 charset_map_loaded = 0; \
887 c = DECODE_CHAR (charset, code); \
888 if (charset_map_loaded \
889 && (offset = coding_change_source (coding))) \
891 src += offset; \
892 src_base += offset; \
893 src_end += offset; \
895 } while (0)
897 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
898 do { \
899 ptrdiff_t offset; \
901 charset_map_loaded = 0; \
902 code = ENCODE_CHAR (charset, c); \
903 if (charset_map_loaded \
904 && (offset = coding_change_destination (coding))) \
906 dst += offset; \
907 dst_end += offset; \
909 } while (0)
911 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
912 do { \
913 ptrdiff_t offset; \
915 charset_map_loaded = 0; \
916 charset = char_charset (c, charset_list, code_return); \
917 if (charset_map_loaded \
918 && (offset = coding_change_destination (coding))) \
920 dst += offset; \
921 dst_end += offset; \
923 } while (0)
925 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
926 do { \
927 ptrdiff_t offset; \
929 charset_map_loaded = 0; \
930 result = CHAR_CHARSET_P (c, charset); \
931 if (charset_map_loaded \
932 && (offset = coding_change_destination (coding))) \
934 dst += offset; \
935 dst_end += offset; \
937 } while (0)
940 /* If there are at least BYTES length of room at dst, allocate memory
941 for coding->destination and update dst and dst_end. We don't have
942 to take care of coding->source which will be relocated. It is
943 handled by calling coding_set_source in encode_coding. */
945 #define ASSURE_DESTINATION(bytes) \
946 do { \
947 if (dst + (bytes) >= dst_end) \
949 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
951 dst = alloc_destination (coding, more_bytes, dst); \
952 dst_end = coding->destination + coding->dst_bytes; \
954 } while (0)
957 /* Store multibyte form of the character C in P, and advance P to the
958 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
959 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
960 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
962 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
964 /* Return the character code of character whose multibyte form is at
965 P, and advance P to the end of the multibyte form. This used to be
966 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
967 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
969 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
971 /* Set coding->source from coding->src_object. */
973 static void
974 coding_set_source (struct coding_system *coding)
976 if (BUFFERP (coding->src_object))
978 struct buffer *buf = XBUFFER (coding->src_object);
980 if (coding->src_pos < 0)
981 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
982 else
983 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
985 else if (STRINGP (coding->src_object))
987 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
989 else
991 /* Otherwise, the source is C string and is never relocated
992 automatically. Thus we don't have to update anything. */
997 /* Set coding->source from coding->src_object, and return how many
998 bytes coding->source was changed. */
1000 static ptrdiff_t
1001 coding_change_source (struct coding_system *coding)
1003 const unsigned char *orig = coding->source;
1004 coding_set_source (coding);
1005 return coding->source - orig;
1009 /* Set coding->destination from coding->dst_object. */
1011 static void
1012 coding_set_destination (struct coding_system *coding)
1014 if (BUFFERP (coding->dst_object))
1016 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1018 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1019 coding->dst_bytes = (GAP_END_ADDR
1020 - (coding->src_bytes - coding->consumed)
1021 - coding->destination);
1023 else
1025 /* We are sure that coding->dst_pos_byte is before the gap
1026 of the buffer. */
1027 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1028 + coding->dst_pos_byte - BEG_BYTE);
1029 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1030 - coding->destination);
1033 else
1035 /* Otherwise, the destination is C string and is never relocated
1036 automatically. Thus we don't have to update anything. */
1041 /* Set coding->destination from coding->dst_object, and return how
1042 many bytes coding->destination was changed. */
1044 static ptrdiff_t
1045 coding_change_destination (struct coding_system *coding)
1047 const unsigned char *orig = coding->destination;
1048 coding_set_destination (coding);
1049 return coding->destination - orig;
1053 static void
1054 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1056 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1057 string_overflow ();
1058 coding->destination = xrealloc (coding->destination,
1059 coding->dst_bytes + bytes);
1060 coding->dst_bytes += bytes;
1063 static void
1064 coding_alloc_by_making_gap (struct coding_system *coding,
1065 ptrdiff_t gap_head_used, ptrdiff_t bytes)
1067 if (EQ (coding->src_object, coding->dst_object))
1069 /* The gap may contain the produced data at the head and not-yet
1070 consumed data at the tail. To preserve those data, we at
1071 first make the gap size to zero, then increase the gap
1072 size. */
1073 ptrdiff_t add = GAP_SIZE;
1075 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1076 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1077 make_gap (bytes);
1078 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1079 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1081 else
1082 make_gap_1 (XBUFFER (coding->dst_object), bytes);
1086 static unsigned char *
1087 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1088 unsigned char *dst)
1090 ptrdiff_t offset = dst - coding->destination;
1092 if (BUFFERP (coding->dst_object))
1094 struct buffer *buf = XBUFFER (coding->dst_object);
1096 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1098 else
1099 coding_alloc_by_realloc (coding, nbytes);
1100 coding_set_destination (coding);
1101 dst = coding->destination + offset;
1102 return dst;
1105 /** Macros for annotations. */
1107 /* An annotation data is stored in the array coding->charbuf in this
1108 format:
1109 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1110 LENGTH is the number of elements in the annotation.
1111 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1112 NCHARS is the number of characters in the text annotated.
1114 The format of the following elements depend on ANNOTATION_MASK.
1116 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1117 follows:
1118 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1120 NBYTES is the number of bytes specified in the header part of
1121 old-style emacs-mule encoding, or 0 for the other kind of
1122 composition.
1124 METHOD is one of enum composition_method.
1126 Optional COMPOSITION-COMPONENTS are characters and composition
1127 rules.
1129 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1130 follows.
1132 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1133 recover from an invalid annotation, and should be skipped by
1134 produce_annotation. */
1136 /* Maximum length of the header of annotation data. */
1137 #define MAX_ANNOTATION_LENGTH 5
1139 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1140 do { \
1141 *(buf)++ = -(len); \
1142 *(buf)++ = (mask); \
1143 *(buf)++ = (nchars); \
1144 coding->annotated = 1; \
1145 } while (0);
1147 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1148 do { \
1149 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1150 *buf++ = nbytes; \
1151 *buf++ = method; \
1152 } while (0)
1155 #define ADD_CHARSET_DATA(buf, nchars, id) \
1156 do { \
1157 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1158 *buf++ = id; \
1159 } while (0)
1162 /* Bitmasks for coding->eol_seen. */
1164 #define EOL_SEEN_NONE 0
1165 #define EOL_SEEN_LF 1
1166 #define EOL_SEEN_CR 2
1167 #define EOL_SEEN_CRLF 4
1170 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1175 /*** 3. UTF-8 ***/
1177 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1178 Return true if a text is encoded in UTF-8. */
1180 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1181 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1182 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1183 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1184 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1185 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1187 #define UTF_8_BOM_1 0xEF
1188 #define UTF_8_BOM_2 0xBB
1189 #define UTF_8_BOM_3 0xBF
1191 /* Unlike the other detect_coding_XXX, this function counts number of
1192 characters and check EOL format. */
1194 static bool
1195 detect_coding_utf_8 (struct coding_system *coding,
1196 struct coding_detection_info *detect_info)
1198 const unsigned char *src = coding->source, *src_base;
1199 const unsigned char *src_end = coding->source + coding->src_bytes;
1200 bool multibytep = coding->src_multibyte;
1201 ptrdiff_t consumed_chars = 0;
1202 bool bom_found = 0;
1203 int nchars = coding->head_ascii;
1204 int eol_seen = coding->eol_seen;
1206 detect_info->checked |= CATEGORY_MASK_UTF_8;
1207 /* A coding system of this category is always ASCII compatible. */
1208 src += nchars;
1210 if (src == coding->source /* BOM should be at the head. */
1211 && src + 3 < src_end /* BOM is 3-byte long. */
1212 && src[0] == UTF_8_BOM_1
1213 && src[1] == UTF_8_BOM_2
1214 && src[2] == UTF_8_BOM_3)
1216 bom_found = 1;
1217 src += 3;
1218 nchars++;
1221 while (1)
1223 int c, c1, c2, c3, c4;
1225 src_base = src;
1226 ONE_MORE_BYTE (c);
1227 if (c < 0 || UTF_8_1_OCTET_P (c))
1229 nchars++;
1230 if (c == '\r')
1232 if (src < src_end && *src == '\n')
1234 eol_seen |= EOL_SEEN_CRLF;
1235 src++;
1236 nchars++;
1238 else
1239 eol_seen |= EOL_SEEN_CR;
1241 else if (c == '\n')
1242 eol_seen |= EOL_SEEN_LF;
1243 continue;
1245 ONE_MORE_BYTE (c1);
1246 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1247 break;
1248 if (UTF_8_2_OCTET_LEADING_P (c))
1250 nchars++;
1251 continue;
1253 ONE_MORE_BYTE (c2);
1254 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1255 break;
1256 if (UTF_8_3_OCTET_LEADING_P (c))
1258 nchars++;
1259 continue;
1261 ONE_MORE_BYTE (c3);
1262 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1263 break;
1264 if (UTF_8_4_OCTET_LEADING_P (c))
1266 nchars++;
1267 continue;
1269 ONE_MORE_BYTE (c4);
1270 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1271 break;
1272 if (UTF_8_5_OCTET_LEADING_P (c))
1274 nchars++;
1275 continue;
1277 break;
1279 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1280 return 0;
1282 no_more_source:
1283 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1285 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1286 return 0;
1288 if (bom_found)
1290 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1291 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1293 else
1295 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1296 if (nchars < src_end - coding->source)
1297 /* The found characters are less than source bytes, which
1298 means that we found a valid non-ASCII characters. */
1299 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1301 coding->detected_utf8_chars = nchars;
1302 return 1;
1306 static void
1307 decode_coding_utf_8 (struct coding_system *coding)
1309 const unsigned char *src = coding->source + coding->consumed;
1310 const unsigned char *src_end = coding->source + coding->src_bytes;
1311 const unsigned char *src_base;
1312 int *charbuf = coding->charbuf + coding->charbuf_used;
1313 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1314 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1315 bool multibytep = coding->src_multibyte;
1316 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1317 bool eol_dos
1318 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1319 int byte_after_cr = -1;
1321 if (bom != utf_without_bom)
1323 int c1, c2, c3;
1325 src_base = src;
1326 ONE_MORE_BYTE (c1);
1327 if (! UTF_8_3_OCTET_LEADING_P (c1))
1328 src = src_base;
1329 else
1331 ONE_MORE_BYTE (c2);
1332 if (! UTF_8_EXTRA_OCTET_P (c2))
1333 src = src_base;
1334 else
1336 ONE_MORE_BYTE (c3);
1337 if (! UTF_8_EXTRA_OCTET_P (c3))
1338 src = src_base;
1339 else
1341 if ((c1 != UTF_8_BOM_1)
1342 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1343 src = src_base;
1344 else
1345 CODING_UTF_8_BOM (coding) = utf_without_bom;
1350 CODING_UTF_8_BOM (coding) = utf_without_bom;
1352 while (1)
1354 int c, c1, c2, c3, c4, c5;
1356 src_base = src;
1357 consumed_chars_base = consumed_chars;
1359 if (charbuf >= charbuf_end)
1361 if (byte_after_cr >= 0)
1362 src_base--;
1363 break;
1366 if (byte_after_cr >= 0)
1367 c1 = byte_after_cr, byte_after_cr = -1;
1368 else
1369 ONE_MORE_BYTE (c1);
1370 if (c1 < 0)
1372 c = - c1;
1374 else if (UTF_8_1_OCTET_P (c1))
1376 if (eol_dos && c1 == '\r')
1377 ONE_MORE_BYTE (byte_after_cr);
1378 c = c1;
1380 else
1382 ONE_MORE_BYTE (c2);
1383 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1384 goto invalid_code;
1385 if (UTF_8_2_OCTET_LEADING_P (c1))
1387 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1388 /* Reject overlong sequences here and below. Encoders
1389 producing them are incorrect, they can be misleading,
1390 and they mess up read/write invariance. */
1391 if (c < 128)
1392 goto invalid_code;
1394 else
1396 ONE_MORE_BYTE (c3);
1397 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1398 goto invalid_code;
1399 if (UTF_8_3_OCTET_LEADING_P (c1))
1401 c = (((c1 & 0xF) << 12)
1402 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1403 if (c < 0x800
1404 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1405 goto invalid_code;
1407 else
1409 ONE_MORE_BYTE (c4);
1410 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1411 goto invalid_code;
1412 if (UTF_8_4_OCTET_LEADING_P (c1))
1414 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1415 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1416 if (c < 0x10000)
1417 goto invalid_code;
1419 else
1421 ONE_MORE_BYTE (c5);
1422 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1423 goto invalid_code;
1424 if (UTF_8_5_OCTET_LEADING_P (c1))
1426 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1427 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1428 | (c5 & 0x3F));
1429 if ((c > MAX_CHAR) || (c < 0x200000))
1430 goto invalid_code;
1432 else
1433 goto invalid_code;
1439 *charbuf++ = c;
1440 continue;
1442 invalid_code:
1443 src = src_base;
1444 consumed_chars = consumed_chars_base;
1445 ONE_MORE_BYTE (c);
1446 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1447 coding->errors++;
1450 no_more_source:
1451 coding->consumed_char += consumed_chars_base;
1452 coding->consumed = src_base - coding->source;
1453 coding->charbuf_used = charbuf - coding->charbuf;
1457 static bool
1458 encode_coding_utf_8 (struct coding_system *coding)
1460 bool multibytep = coding->dst_multibyte;
1461 int *charbuf = coding->charbuf;
1462 int *charbuf_end = charbuf + coding->charbuf_used;
1463 unsigned char *dst = coding->destination + coding->produced;
1464 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1465 ptrdiff_t produced_chars = 0;
1466 int c;
1468 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1470 ASSURE_DESTINATION (3);
1471 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1472 CODING_UTF_8_BOM (coding) = utf_without_bom;
1475 if (multibytep)
1477 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1479 while (charbuf < charbuf_end)
1481 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1483 ASSURE_DESTINATION (safe_room);
1484 c = *charbuf++;
1485 if (CHAR_BYTE8_P (c))
1487 c = CHAR_TO_BYTE8 (c);
1488 EMIT_ONE_BYTE (c);
1490 else
1492 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1493 for (p = str; p < pend; p++)
1494 EMIT_ONE_BYTE (*p);
1498 else
1500 int safe_room = MAX_MULTIBYTE_LENGTH;
1502 while (charbuf < charbuf_end)
1504 ASSURE_DESTINATION (safe_room);
1505 c = *charbuf++;
1506 if (CHAR_BYTE8_P (c))
1507 *dst++ = CHAR_TO_BYTE8 (c);
1508 else
1509 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1510 produced_chars++;
1513 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1514 coding->produced_char += produced_chars;
1515 coding->produced = dst - coding->destination;
1516 return 0;
1520 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1521 Return true if a text is encoded in one of UTF-16 based coding systems. */
1523 #define UTF_16_HIGH_SURROGATE_P(val) \
1524 (((val) & 0xFC00) == 0xD800)
1526 #define UTF_16_LOW_SURROGATE_P(val) \
1527 (((val) & 0xFC00) == 0xDC00)
1530 static bool
1531 detect_coding_utf_16 (struct coding_system *coding,
1532 struct coding_detection_info *detect_info)
1534 const unsigned char *src = coding->source;
1535 const unsigned char *src_end = coding->source + coding->src_bytes;
1536 bool multibytep = coding->src_multibyte;
1537 int c1, c2;
1539 detect_info->checked |= CATEGORY_MASK_UTF_16;
1540 if (coding->mode & CODING_MODE_LAST_BLOCK
1541 && (coding->src_chars & 1))
1543 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1544 return 0;
1547 TWO_MORE_BYTES (c1, c2);
1548 if ((c1 == 0xFF) && (c2 == 0xFE))
1550 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1551 | CATEGORY_MASK_UTF_16_AUTO);
1552 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1553 | CATEGORY_MASK_UTF_16_BE_NOSIG
1554 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1556 else if ((c1 == 0xFE) && (c2 == 0xFF))
1558 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1559 | CATEGORY_MASK_UTF_16_AUTO);
1560 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1561 | CATEGORY_MASK_UTF_16_BE_NOSIG
1562 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1564 else if (c2 < 0)
1566 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1567 return 0;
1569 else
1571 /* We check the dispersion of Eth and Oth bytes where E is even and
1572 O is odd. If both are high, we assume binary data.*/
1573 unsigned char e[256], o[256];
1574 unsigned e_num = 1, o_num = 1;
1576 memset (e, 0, 256);
1577 memset (o, 0, 256);
1578 e[c1] = 1;
1579 o[c2] = 1;
1581 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1582 |CATEGORY_MASK_UTF_16_BE
1583 | CATEGORY_MASK_UTF_16_LE);
1585 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1586 != CATEGORY_MASK_UTF_16)
1588 TWO_MORE_BYTES (c1, c2);
1589 if (c2 < 0)
1590 break;
1591 if (! e[c1])
1593 e[c1] = 1;
1594 e_num++;
1595 if (e_num >= 128)
1596 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1598 if (! o[c2])
1600 o[c2] = 1;
1601 o_num++;
1602 if (o_num >= 128)
1603 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1606 return 0;
1609 no_more_source:
1610 return 1;
1613 static void
1614 decode_coding_utf_16 (struct coding_system *coding)
1616 const unsigned char *src = coding->source + coding->consumed;
1617 const unsigned char *src_end = coding->source + coding->src_bytes;
1618 const unsigned char *src_base;
1619 int *charbuf = coding->charbuf + coding->charbuf_used;
1620 /* We may produces at most 3 chars in one loop. */
1621 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1622 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1623 bool multibytep = coding->src_multibyte;
1624 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1625 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1626 int surrogate = CODING_UTF_16_SURROGATE (coding);
1627 bool eol_dos
1628 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1629 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1631 if (bom == utf_with_bom)
1633 int c, c1, c2;
1635 src_base = src;
1636 ONE_MORE_BYTE (c1);
1637 ONE_MORE_BYTE (c2);
1638 c = (c1 << 8) | c2;
1640 if (endian == utf_16_big_endian
1641 ? c != 0xFEFF : c != 0xFFFE)
1643 /* The first two bytes are not BOM. Treat them as bytes
1644 for a normal character. */
1645 src = src_base;
1646 coding->errors++;
1648 CODING_UTF_16_BOM (coding) = utf_without_bom;
1650 else if (bom == utf_detect_bom)
1652 /* We have already tried to detect BOM and failed in
1653 detect_coding. */
1654 CODING_UTF_16_BOM (coding) = utf_without_bom;
1657 while (1)
1659 int c, c1, c2;
1661 src_base = src;
1662 consumed_chars_base = consumed_chars;
1664 if (charbuf >= charbuf_end)
1666 if (byte_after_cr1 >= 0)
1667 src_base -= 2;
1668 break;
1671 if (byte_after_cr1 >= 0)
1672 c1 = byte_after_cr1, byte_after_cr1 = -1;
1673 else
1674 ONE_MORE_BYTE (c1);
1675 if (c1 < 0)
1677 *charbuf++ = -c1;
1678 continue;
1680 if (byte_after_cr2 >= 0)
1681 c2 = byte_after_cr2, byte_after_cr2 = -1;
1682 else
1683 ONE_MORE_BYTE (c2);
1684 if (c2 < 0)
1686 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1687 *charbuf++ = -c2;
1688 continue;
1690 c = (endian == utf_16_big_endian
1691 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1693 if (surrogate)
1695 if (! UTF_16_LOW_SURROGATE_P (c))
1697 if (endian == utf_16_big_endian)
1698 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1699 else
1700 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1701 *charbuf++ = c1;
1702 *charbuf++ = c2;
1703 coding->errors++;
1704 if (UTF_16_HIGH_SURROGATE_P (c))
1705 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1706 else
1707 *charbuf++ = c;
1709 else
1711 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1712 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1713 *charbuf++ = 0x10000 + c;
1716 else
1718 if (UTF_16_HIGH_SURROGATE_P (c))
1719 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1720 else
1722 if (eol_dos && c == '\r')
1724 ONE_MORE_BYTE (byte_after_cr1);
1725 ONE_MORE_BYTE (byte_after_cr2);
1727 *charbuf++ = c;
1732 no_more_source:
1733 coding->consumed_char += consumed_chars_base;
1734 coding->consumed = src_base - coding->source;
1735 coding->charbuf_used = charbuf - coding->charbuf;
1738 static bool
1739 encode_coding_utf_16 (struct coding_system *coding)
1741 bool multibytep = coding->dst_multibyte;
1742 int *charbuf = coding->charbuf;
1743 int *charbuf_end = charbuf + coding->charbuf_used;
1744 unsigned char *dst = coding->destination + coding->produced;
1745 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1746 int safe_room = 8;
1747 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1748 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1749 ptrdiff_t produced_chars = 0;
1750 int c;
1752 if (bom != utf_without_bom)
1754 ASSURE_DESTINATION (safe_room);
1755 if (big_endian)
1756 EMIT_TWO_BYTES (0xFE, 0xFF);
1757 else
1758 EMIT_TWO_BYTES (0xFF, 0xFE);
1759 CODING_UTF_16_BOM (coding) = utf_without_bom;
1762 while (charbuf < charbuf_end)
1764 ASSURE_DESTINATION (safe_room);
1765 c = *charbuf++;
1766 if (c > MAX_UNICODE_CHAR)
1767 c = coding->default_char;
1769 if (c < 0x10000)
1771 if (big_endian)
1772 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1773 else
1774 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1776 else
1778 int c1, c2;
1780 c -= 0x10000;
1781 c1 = (c >> 10) + 0xD800;
1782 c2 = (c & 0x3FF) + 0xDC00;
1783 if (big_endian)
1784 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1785 else
1786 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1789 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1790 coding->produced = dst - coding->destination;
1791 coding->produced_char += produced_chars;
1792 return 0;
1796 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1798 /* Emacs' internal format for representation of multiple character
1799 sets is a kind of multi-byte encoding, i.e. characters are
1800 represented by variable-length sequences of one-byte codes.
1802 ASCII characters and control characters (e.g. `tab', `newline') are
1803 represented by one-byte sequences which are their ASCII codes, in
1804 the range 0x00 through 0x7F.
1806 8-bit characters of the range 0x80..0x9F are represented by
1807 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1808 code + 0x20).
1810 8-bit characters of the range 0xA0..0xFF are represented by
1811 one-byte sequences which are their 8-bit code.
1813 The other characters are represented by a sequence of `base
1814 leading-code', optional `extended leading-code', and one or two
1815 `position-code's. The length of the sequence is determined by the
1816 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1817 whereas extended leading-code and position-code take the range 0xA0
1818 through 0xFF. See `charset.h' for more details about leading-code
1819 and position-code.
1821 --- CODE RANGE of Emacs' internal format ---
1822 character set range
1823 ------------- -----
1824 ascii 0x00..0x7F
1825 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1826 eight-bit-graphic 0xA0..0xBF
1827 ELSE 0x81..0x9D + [0xA0..0xFF]+
1828 ---------------------------------------------
1830 As this is the internal character representation, the format is
1831 usually not used externally (i.e. in a file or in a data sent to a
1832 process). But, it is possible to have a text externally in this
1833 format (i.e. by encoding by the coding system `emacs-mule').
1835 In that case, a sequence of one-byte codes has a slightly different
1836 form.
1838 At first, all characters in eight-bit-control are represented by
1839 one-byte sequences which are their 8-bit code.
1841 Next, character composition data are represented by the byte
1842 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1843 where,
1844 METHOD is 0xF2 plus one of composition method (enum
1845 composition_method),
1847 BYTES is 0xA0 plus a byte length of this composition data,
1849 CHARS is 0xA0 plus a number of characters composed by this
1850 data,
1852 COMPONENTs are characters of multibyte form or composition
1853 rules encoded by two-byte of ASCII codes.
1855 In addition, for backward compatibility, the following formats are
1856 also recognized as composition data on decoding.
1858 0x80 MSEQ ...
1859 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1861 Here,
1862 MSEQ is a multibyte form but in these special format:
1863 ASCII: 0xA0 ASCII_CODE+0x80,
1864 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1865 RULE is a one byte code of the range 0xA0..0xF0 that
1866 represents a composition rule.
1869 char emacs_mule_bytes[256];
1872 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1873 Return true if a text is encoded in 'emacs-mule'. */
1875 static bool
1876 detect_coding_emacs_mule (struct coding_system *coding,
1877 struct coding_detection_info *detect_info)
1879 const unsigned char *src = coding->source, *src_base;
1880 const unsigned char *src_end = coding->source + coding->src_bytes;
1881 bool multibytep = coding->src_multibyte;
1882 ptrdiff_t consumed_chars = 0;
1883 int c;
1884 int found = 0;
1886 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1887 /* A coding system of this category is always ASCII compatible. */
1888 src += coding->head_ascii;
1890 while (1)
1892 src_base = src;
1893 ONE_MORE_BYTE (c);
1894 if (c < 0)
1895 continue;
1896 if (c == 0x80)
1898 /* Perhaps the start of composite character. We simply skip
1899 it because analyzing it is too heavy for detecting. But,
1900 at least, we check that the composite character
1901 constitutes of more than 4 bytes. */
1902 const unsigned char *src_start;
1904 repeat:
1905 src_start = src;
1908 ONE_MORE_BYTE (c);
1910 while (c >= 0xA0);
1912 if (src - src_start <= 4)
1913 break;
1914 found = CATEGORY_MASK_EMACS_MULE;
1915 if (c == 0x80)
1916 goto repeat;
1919 if (c < 0x80)
1921 if (c < 0x20
1922 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1923 break;
1925 else
1927 int more_bytes = emacs_mule_bytes[c] - 1;
1929 while (more_bytes > 0)
1931 ONE_MORE_BYTE (c);
1932 if (c < 0xA0)
1934 src--; /* Unread the last byte. */
1935 break;
1937 more_bytes--;
1939 if (more_bytes != 0)
1940 break;
1941 found = CATEGORY_MASK_EMACS_MULE;
1944 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1945 return 0;
1947 no_more_source:
1948 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1950 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1951 return 0;
1953 detect_info->found |= found;
1954 return 1;
1958 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1959 character. If CMP_STATUS indicates that we must expect MSEQ or
1960 RULE described above, decode it and return the negative value of
1961 the decoded character or rule. If an invalid byte is found, return
1962 -1. If SRC is too short, return -2. */
1964 static int
1965 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1966 int *nbytes, int *nchars, int *id,
1967 struct composition_status *cmp_status)
1969 const unsigned char *src_end = coding->source + coding->src_bytes;
1970 const unsigned char *src_base = src;
1971 bool multibytep = coding->src_multibyte;
1972 int charset_ID;
1973 unsigned code;
1974 int c;
1975 int consumed_chars = 0;
1976 bool mseq_found = 0;
1978 ONE_MORE_BYTE (c);
1979 if (c < 0)
1981 c = -c;
1982 charset_ID = emacs_mule_charset[0];
1984 else
1986 if (c >= 0xA0)
1988 if (cmp_status->state != COMPOSING_NO
1989 && cmp_status->old_form)
1991 if (cmp_status->state == COMPOSING_CHAR)
1993 if (c == 0xA0)
1995 ONE_MORE_BYTE (c);
1996 c -= 0x80;
1997 if (c < 0)
1998 goto invalid_code;
2000 else
2001 c -= 0x20;
2002 mseq_found = 1;
2004 else
2006 *nbytes = src - src_base;
2007 *nchars = consumed_chars;
2008 return -c;
2011 else
2012 goto invalid_code;
2015 switch (emacs_mule_bytes[c])
2017 case 2:
2018 if ((charset_ID = emacs_mule_charset[c]) < 0)
2019 goto invalid_code;
2020 ONE_MORE_BYTE (c);
2021 if (c < 0xA0)
2022 goto invalid_code;
2023 code = c & 0x7F;
2024 break;
2026 case 3:
2027 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2028 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2030 ONE_MORE_BYTE (c);
2031 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2032 goto invalid_code;
2033 ONE_MORE_BYTE (c);
2034 if (c < 0xA0)
2035 goto invalid_code;
2036 code = c & 0x7F;
2038 else
2040 if ((charset_ID = emacs_mule_charset[c]) < 0)
2041 goto invalid_code;
2042 ONE_MORE_BYTE (c);
2043 if (c < 0xA0)
2044 goto invalid_code;
2045 code = (c & 0x7F) << 8;
2046 ONE_MORE_BYTE (c);
2047 if (c < 0xA0)
2048 goto invalid_code;
2049 code |= c & 0x7F;
2051 break;
2053 case 4:
2054 ONE_MORE_BYTE (c);
2055 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2056 goto invalid_code;
2057 ONE_MORE_BYTE (c);
2058 if (c < 0xA0)
2059 goto invalid_code;
2060 code = (c & 0x7F) << 8;
2061 ONE_MORE_BYTE (c);
2062 if (c < 0xA0)
2063 goto invalid_code;
2064 code |= c & 0x7F;
2065 break;
2067 case 1:
2068 code = c;
2069 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2070 break;
2072 default:
2073 emacs_abort ();
2075 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2076 CHARSET_FROM_ID (charset_ID), code, c);
2077 if (c < 0)
2078 goto invalid_code;
2080 *nbytes = src - src_base;
2081 *nchars = consumed_chars;
2082 if (id)
2083 *id = charset_ID;
2084 return (mseq_found ? -c : c);
2086 no_more_source:
2087 return -2;
2089 invalid_code:
2090 return -1;
2094 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2096 /* Handle these composition sequence ('|': the end of header elements,
2097 BYTES and CHARS >= 0xA0):
2099 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2100 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2101 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2103 and these old form:
2105 (4) relative composition: 0x80 | MSEQ ... MSEQ
2106 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2108 When the starter 0x80 and the following header elements are found,
2109 this annotation header is produced.
2111 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2113 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2114 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2116 Then, upon reading the following elements, these codes are produced
2117 until the composition end is found:
2119 (1) CHAR ... CHAR
2120 (2) ALT ... ALT CHAR ... CHAR
2121 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2122 (4) CHAR ... CHAR
2123 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2125 When the composition end is found, LENGTH and NCHARS in the
2126 annotation header is updated as below:
2128 (1) LENGTH: unchanged, NCHARS: unchanged
2129 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2130 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2131 (4) LENGTH: unchanged, NCHARS: number of CHARs
2132 (5) LENGTH: unchanged, NCHARS: number of CHARs
2134 If an error is found while composing, the annotation header is
2135 changed to the original composition header (plus filler -1s) as
2136 below:
2138 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2139 (5) [ 0x80 0xFF -1 -1- -1 ]
2141 and the sequence [ -2 DECODED-RULE ] is changed to the original
2142 byte sequence as below:
2143 o the original byte sequence is B: [ B -1 ]
2144 o the original byte sequence is B1 B2: [ B1 B2 ]
2146 Most of the routines are implemented by macros because many
2147 variables and labels in the caller decode_coding_emacs_mule must be
2148 accessible, and they are usually called just once (thus doesn't
2149 increase the size of compiled object). */
2151 /* Decode a composition rule represented by C as a component of
2152 composition sequence of Emacs 20 style. Set RULE to the decoded
2153 rule. */
2155 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2156 do { \
2157 int gref, nref; \
2159 c -= 0xA0; \
2160 if (c < 0 || c >= 81) \
2161 goto invalid_code; \
2162 gref = c / 9, nref = c % 9; \
2163 if (gref == 4) gref = 10; \
2164 if (nref == 4) nref = 10; \
2165 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2166 } while (0)
2169 /* Decode a composition rule represented by C and the following byte
2170 at SRC as a component of composition sequence of Emacs 21 style.
2171 Set RULE to the decoded rule. */
2173 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2174 do { \
2175 int gref, nref; \
2177 gref = c - 0x20; \
2178 if (gref < 0 || gref >= 81) \
2179 goto invalid_code; \
2180 ONE_MORE_BYTE (c); \
2181 nref = c - 0x20; \
2182 if (nref < 0 || nref >= 81) \
2183 goto invalid_code; \
2184 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2185 } while (0)
2188 /* Start of Emacs 21 style format. The first three bytes at SRC are
2189 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2190 byte length of this composition information, CHARS is the number of
2191 characters composed by this composition. */
2193 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2194 do { \
2195 enum composition_method method = c - 0xF2; \
2196 int nbytes, nchars; \
2198 ONE_MORE_BYTE (c); \
2199 if (c < 0) \
2200 goto invalid_code; \
2201 nbytes = c - 0xA0; \
2202 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2203 goto invalid_code; \
2204 ONE_MORE_BYTE (c); \
2205 nchars = c - 0xA0; \
2206 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2207 goto invalid_code; \
2208 cmp_status->old_form = 0; \
2209 cmp_status->method = method; \
2210 if (method == COMPOSITION_RELATIVE) \
2211 cmp_status->state = COMPOSING_CHAR; \
2212 else \
2213 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2214 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2215 cmp_status->nchars = nchars; \
2216 cmp_status->ncomps = nbytes - 4; \
2217 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2218 } while (0)
2221 /* Start of Emacs 20 style format for relative composition. */
2223 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2224 do { \
2225 cmp_status->old_form = 1; \
2226 cmp_status->method = COMPOSITION_RELATIVE; \
2227 cmp_status->state = COMPOSING_CHAR; \
2228 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2229 cmp_status->nchars = cmp_status->ncomps = 0; \
2230 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2231 } while (0)
2234 /* Start of Emacs 20 style format for rule-base composition. */
2236 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2237 do { \
2238 cmp_status->old_form = 1; \
2239 cmp_status->method = COMPOSITION_WITH_RULE; \
2240 cmp_status->state = COMPOSING_CHAR; \
2241 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2242 cmp_status->nchars = cmp_status->ncomps = 0; \
2243 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2244 } while (0)
2247 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2248 do { \
2249 const unsigned char *current_src = src; \
2251 ONE_MORE_BYTE (c); \
2252 if (c < 0) \
2253 goto invalid_code; \
2254 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2255 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2256 DECODE_EMACS_MULE_21_COMPOSITION (); \
2257 else if (c < 0xA0) \
2258 goto invalid_code; \
2259 else if (c < 0xC0) \
2261 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2262 /* Re-read C as a composition component. */ \
2263 src = current_src; \
2265 else if (c == 0xFF) \
2266 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2267 else \
2268 goto invalid_code; \
2269 } while (0)
2271 #define EMACS_MULE_COMPOSITION_END() \
2272 do { \
2273 int idx = - cmp_status->length; \
2275 if (cmp_status->old_form) \
2276 charbuf[idx + 2] = cmp_status->nchars; \
2277 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2278 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2279 cmp_status->state = COMPOSING_NO; \
2280 } while (0)
2283 static int
2284 emacs_mule_finish_composition (int *charbuf,
2285 struct composition_status *cmp_status)
2287 int idx = - cmp_status->length;
2288 int new_chars;
2290 if (cmp_status->old_form && cmp_status->nchars > 0)
2292 charbuf[idx + 2] = cmp_status->nchars;
2293 new_chars = 0;
2294 if (cmp_status->method == COMPOSITION_WITH_RULE
2295 && cmp_status->state == COMPOSING_CHAR)
2297 /* The last rule was invalid. */
2298 int rule = charbuf[-1] + 0xA0;
2300 charbuf[-2] = BYTE8_TO_CHAR (rule);
2301 charbuf[-1] = -1;
2302 new_chars = 1;
2305 else
2307 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2309 if (cmp_status->method == COMPOSITION_WITH_RULE)
2311 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2312 charbuf[idx++] = -3;
2313 charbuf[idx++] = 0;
2314 new_chars = 1;
2316 else
2318 int nchars = charbuf[idx + 1] + 0xA0;
2319 int nbytes = charbuf[idx + 2] + 0xA0;
2321 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2322 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2323 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2324 charbuf[idx++] = -1;
2325 new_chars = 4;
2328 cmp_status->state = COMPOSING_NO;
2329 return new_chars;
2332 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2333 do { \
2334 if (cmp_status->state != COMPOSING_NO) \
2335 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2336 } while (0)
2339 static void
2340 decode_coding_emacs_mule (struct coding_system *coding)
2342 const unsigned char *src = coding->source + coding->consumed;
2343 const unsigned char *src_end = coding->source + coding->src_bytes;
2344 const unsigned char *src_base;
2345 int *charbuf = coding->charbuf + coding->charbuf_used;
2346 /* We may produce two annotations (charset and composition) in one
2347 loop and one more charset annotation at the end. */
2348 int *charbuf_end
2349 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2350 /* We can produce up to 2 characters in a loop. */
2351 - 1;
2352 ptrdiff_t consumed_chars = 0, consumed_chars_base;
2353 bool multibytep = coding->src_multibyte;
2354 ptrdiff_t char_offset = coding->produced_char;
2355 ptrdiff_t last_offset = char_offset;
2356 int last_id = charset_ascii;
2357 bool eol_dos
2358 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2359 int byte_after_cr = -1;
2360 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2362 if (cmp_status->state != COMPOSING_NO)
2364 int i;
2366 if (charbuf_end - charbuf < cmp_status->length)
2367 emacs_abort ();
2368 for (i = 0; i < cmp_status->length; i++)
2369 *charbuf++ = cmp_status->carryover[i];
2370 coding->annotated = 1;
2373 while (1)
2375 int c, id IF_LINT (= 0);
2377 src_base = src;
2378 consumed_chars_base = consumed_chars;
2380 if (charbuf >= charbuf_end)
2382 if (byte_after_cr >= 0)
2383 src_base--;
2384 break;
2387 if (byte_after_cr >= 0)
2388 c = byte_after_cr, byte_after_cr = -1;
2389 else
2390 ONE_MORE_BYTE (c);
2392 if (c < 0 || c == 0x80)
2394 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2395 if (c < 0)
2397 *charbuf++ = -c;
2398 char_offset++;
2400 else
2401 DECODE_EMACS_MULE_COMPOSITION_START ();
2402 continue;
2405 if (c < 0x80)
2407 if (eol_dos && c == '\r')
2408 ONE_MORE_BYTE (byte_after_cr);
2409 id = charset_ascii;
2410 if (cmp_status->state != COMPOSING_NO)
2412 if (cmp_status->old_form)
2413 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2414 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2415 cmp_status->ncomps--;
2418 else
2420 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2421 /* emacs_mule_char can load a charset map from a file, which
2422 allocates a large structure and might cause buffer text
2423 to be relocated as result. Thus, we need to remember the
2424 original pointer to buffer text, and fix up all related
2425 pointers after the call. */
2426 const unsigned char *orig = coding->source;
2427 ptrdiff_t offset;
2429 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2430 cmp_status);
2431 offset = coding->source - orig;
2432 if (offset)
2434 src += offset;
2435 src_base += offset;
2436 src_end += offset;
2438 if (c < 0)
2440 if (c == -1)
2441 goto invalid_code;
2442 if (c == -2)
2443 break;
2445 src = src_base + nbytes;
2446 consumed_chars = consumed_chars_base + nchars;
2447 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2448 cmp_status->ncomps -= nchars;
2451 /* Now if C >= 0, we found a normally encoded character, if C <
2452 0, we found an old-style composition component character or
2453 rule. */
2455 if (cmp_status->state == COMPOSING_NO)
2457 if (last_id != id)
2459 if (last_id != charset_ascii)
2460 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2461 last_id);
2462 last_id = id;
2463 last_offset = char_offset;
2465 *charbuf++ = c;
2466 char_offset++;
2468 else if (cmp_status->state == COMPOSING_CHAR)
2470 if (cmp_status->old_form)
2472 if (c >= 0)
2474 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2475 *charbuf++ = c;
2476 char_offset++;
2478 else
2480 *charbuf++ = -c;
2481 cmp_status->nchars++;
2482 cmp_status->length++;
2483 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2484 EMACS_MULE_COMPOSITION_END ();
2485 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2486 cmp_status->state = COMPOSING_RULE;
2489 else
2491 *charbuf++ = c;
2492 cmp_status->length++;
2493 cmp_status->nchars--;
2494 if (cmp_status->nchars == 0)
2495 EMACS_MULE_COMPOSITION_END ();
2498 else if (cmp_status->state == COMPOSING_RULE)
2500 int rule;
2502 if (c >= 0)
2504 EMACS_MULE_COMPOSITION_END ();
2505 *charbuf++ = c;
2506 char_offset++;
2508 else
2510 c = -c;
2511 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2512 if (rule < 0)
2513 goto invalid_code;
2514 *charbuf++ = -2;
2515 *charbuf++ = rule;
2516 cmp_status->length += 2;
2517 cmp_status->state = COMPOSING_CHAR;
2520 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2522 *charbuf++ = c;
2523 cmp_status->length++;
2524 if (cmp_status->ncomps == 0)
2525 cmp_status->state = COMPOSING_CHAR;
2526 else if (cmp_status->ncomps > 0)
2528 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2529 cmp_status->state = COMPOSING_COMPONENT_RULE;
2531 else
2532 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2534 else /* COMPOSING_COMPONENT_RULE */
2536 int rule;
2538 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2539 if (rule < 0)
2540 goto invalid_code;
2541 *charbuf++ = -2;
2542 *charbuf++ = rule;
2543 cmp_status->length += 2;
2544 cmp_status->ncomps--;
2545 if (cmp_status->ncomps > 0)
2546 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2547 else
2548 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2550 continue;
2552 invalid_code:
2553 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2554 src = src_base;
2555 consumed_chars = consumed_chars_base;
2556 ONE_MORE_BYTE (c);
2557 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2558 char_offset++;
2559 coding->errors++;
2562 no_more_source:
2563 if (cmp_status->state != COMPOSING_NO)
2565 if (coding->mode & CODING_MODE_LAST_BLOCK)
2566 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2567 else
2569 int i;
2571 charbuf -= cmp_status->length;
2572 for (i = 0; i < cmp_status->length; i++)
2573 cmp_status->carryover[i] = charbuf[i];
2576 if (last_id != charset_ascii)
2577 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2578 coding->consumed_char += consumed_chars_base;
2579 coding->consumed = src_base - coding->source;
2580 coding->charbuf_used = charbuf - coding->charbuf;
2584 #define EMACS_MULE_LEADING_CODES(id, codes) \
2585 do { \
2586 if (id < 0xA0) \
2587 codes[0] = id, codes[1] = 0; \
2588 else if (id < 0xE0) \
2589 codes[0] = 0x9A, codes[1] = id; \
2590 else if (id < 0xF0) \
2591 codes[0] = 0x9B, codes[1] = id; \
2592 else if (id < 0xF5) \
2593 codes[0] = 0x9C, codes[1] = id; \
2594 else \
2595 codes[0] = 0x9D, codes[1] = id; \
2596 } while (0);
2599 static bool
2600 encode_coding_emacs_mule (struct coding_system *coding)
2602 bool multibytep = coding->dst_multibyte;
2603 int *charbuf = coding->charbuf;
2604 int *charbuf_end = charbuf + coding->charbuf_used;
2605 unsigned char *dst = coding->destination + coding->produced;
2606 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2607 int safe_room = 8;
2608 ptrdiff_t produced_chars = 0;
2609 Lisp_Object attrs, charset_list;
2610 int c;
2611 int preferred_charset_id = -1;
2613 CODING_GET_INFO (coding, attrs, charset_list);
2614 if (! EQ (charset_list, Vemacs_mule_charset_list))
2616 charset_list = Vemacs_mule_charset_list;
2617 ASET (attrs, coding_attr_charset_list, charset_list);
2620 while (charbuf < charbuf_end)
2622 ASSURE_DESTINATION (safe_room);
2623 c = *charbuf++;
2625 if (c < 0)
2627 /* Handle an annotation. */
2628 switch (*charbuf)
2630 case CODING_ANNOTATE_COMPOSITION_MASK:
2631 /* Not yet implemented. */
2632 break;
2633 case CODING_ANNOTATE_CHARSET_MASK:
2634 preferred_charset_id = charbuf[3];
2635 if (preferred_charset_id >= 0
2636 && NILP (Fmemq (make_number (preferred_charset_id),
2637 charset_list)))
2638 preferred_charset_id = -1;
2639 break;
2640 default:
2641 emacs_abort ();
2643 charbuf += -c - 1;
2644 continue;
2647 if (ASCII_CHAR_P (c))
2648 EMIT_ONE_ASCII_BYTE (c);
2649 else if (CHAR_BYTE8_P (c))
2651 c = CHAR_TO_BYTE8 (c);
2652 EMIT_ONE_BYTE (c);
2654 else
2656 struct charset *charset;
2657 unsigned code;
2658 int dimension;
2659 int emacs_mule_id;
2660 unsigned char leading_codes[2];
2662 if (preferred_charset_id >= 0)
2664 bool result;
2666 charset = CHARSET_FROM_ID (preferred_charset_id);
2667 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2668 if (result)
2669 code = ENCODE_CHAR (charset, c);
2670 else
2671 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2672 &code, charset);
2674 else
2675 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2676 &code, charset);
2677 if (! charset)
2679 c = coding->default_char;
2680 if (ASCII_CHAR_P (c))
2682 EMIT_ONE_ASCII_BYTE (c);
2683 continue;
2685 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2686 &code, charset);
2688 dimension = CHARSET_DIMENSION (charset);
2689 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2690 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2691 EMIT_ONE_BYTE (leading_codes[0]);
2692 if (leading_codes[1])
2693 EMIT_ONE_BYTE (leading_codes[1]);
2694 if (dimension == 1)
2695 EMIT_ONE_BYTE (code | 0x80);
2696 else
2698 code |= 0x8080;
2699 EMIT_ONE_BYTE (code >> 8);
2700 EMIT_ONE_BYTE (code & 0xFF);
2704 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2705 coding->produced_char += produced_chars;
2706 coding->produced = dst - coding->destination;
2707 return 0;
2711 /*** 7. ISO2022 handlers ***/
2713 /* The following note describes the coding system ISO2022 briefly.
2714 Since the intention of this note is to help understand the
2715 functions in this file, some parts are NOT ACCURATE or are OVERLY
2716 SIMPLIFIED. For thorough understanding, please refer to the
2717 original document of ISO2022. This is equivalent to the standard
2718 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2720 ISO2022 provides many mechanisms to encode several character sets
2721 in 7-bit and 8-bit environments. For 7-bit environments, all text
2722 is encoded using bytes less than 128. This may make the encoded
2723 text a little bit longer, but the text passes more easily through
2724 several types of gateway, some of which strip off the MSB (Most
2725 Significant Bit).
2727 There are two kinds of character sets: control character sets and
2728 graphic character sets. The former contain control characters such
2729 as `newline' and `escape' to provide control functions (control
2730 functions are also provided by escape sequences). The latter
2731 contain graphic characters such as 'A' and '-'. Emacs recognizes
2732 two control character sets and many graphic character sets.
2734 Graphic character sets are classified into one of the following
2735 four classes, according to the number of bytes (DIMENSION) and
2736 number of characters in one dimension (CHARS) of the set:
2737 - DIMENSION1_CHARS94
2738 - DIMENSION1_CHARS96
2739 - DIMENSION2_CHARS94
2740 - DIMENSION2_CHARS96
2742 In addition, each character set is assigned an identification tag,
2743 unique for each set, called the "final character" (denoted as <F>
2744 hereafter). The <F> of each character set is decided by ECMA(*)
2745 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2746 (0x30..0x3F are for private use only).
2748 Note (*): ECMA = European Computer Manufacturers Association
2750 Here are examples of graphic character sets [NAME(<F>)]:
2751 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2752 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2753 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2754 o DIMENSION2_CHARS96 -- none for the moment
2756 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2757 C0 [0x00..0x1F] -- control character plane 0
2758 GL [0x20..0x7F] -- graphic character plane 0
2759 C1 [0x80..0x9F] -- control character plane 1
2760 GR [0xA0..0xFF] -- graphic character plane 1
2762 A control character set is directly designated and invoked to C0 or
2763 C1 by an escape sequence. The most common case is that:
2764 - ISO646's control character set is designated/invoked to C0, and
2765 - ISO6429's control character set is designated/invoked to C1,
2766 and usually these designations/invocations are omitted in encoded
2767 text. In a 7-bit environment, only C0 can be used, and a control
2768 character for C1 is encoded by an appropriate escape sequence to
2769 fit into the environment. All control characters for C1 are
2770 defined to have corresponding escape sequences.
2772 A graphic character set is at first designated to one of four
2773 graphic registers (G0 through G3), then these graphic registers are
2774 invoked to GL or GR. These designations and invocations can be
2775 done independently. The most common case is that G0 is invoked to
2776 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2777 these invocations and designations are omitted in encoded text.
2778 In a 7-bit environment, only GL can be used.
2780 When a graphic character set of CHARS94 is invoked to GL, codes
2781 0x20 and 0x7F of the GL area work as control characters SPACE and
2782 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2783 be used.
2785 There are two ways of invocation: locking-shift and single-shift.
2786 With locking-shift, the invocation lasts until the next different
2787 invocation, whereas with single-shift, the invocation affects the
2788 following character only and doesn't affect the locking-shift
2789 state. Invocations are done by the following control characters or
2790 escape sequences:
2792 ----------------------------------------------------------------------
2793 abbrev function cntrl escape seq description
2794 ----------------------------------------------------------------------
2795 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2796 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2797 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2798 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2799 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2800 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2801 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2802 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2803 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2804 ----------------------------------------------------------------------
2805 (*) These are not used by any known coding system.
2807 Control characters for these functions are defined by macros
2808 ISO_CODE_XXX in `coding.h'.
2810 Designations are done by the following escape sequences:
2811 ----------------------------------------------------------------------
2812 escape sequence description
2813 ----------------------------------------------------------------------
2814 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2815 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2816 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2817 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2818 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2819 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2820 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2821 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2822 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2823 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2824 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2825 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2826 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2827 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2828 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2829 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2830 ----------------------------------------------------------------------
2832 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2833 of dimension 1, chars 94, and final character <F>, etc...
2835 Note (*): Although these designations are not allowed in ISO2022,
2836 Emacs accepts them on decoding, and produces them on encoding
2837 CHARS96 character sets in a coding system which is characterized as
2838 7-bit environment, non-locking-shift, and non-single-shift.
2840 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2841 '(' must be omitted. We refer to this as "short-form" hereafter.
2843 Now you may notice that there are a lot of ways of encoding the
2844 same multilingual text in ISO2022. Actually, there exist many
2845 coding systems such as Compound Text (used in X11's inter client
2846 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2847 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2848 localized platforms), and all of these are variants of ISO2022.
2850 In addition to the above, Emacs handles two more kinds of escape
2851 sequences: ISO6429's direction specification and Emacs' private
2852 sequence for specifying character composition.
2854 ISO6429's direction specification takes the following form:
2855 o CSI ']' -- end of the current direction
2856 o CSI '0' ']' -- end of the current direction
2857 o CSI '1' ']' -- start of left-to-right text
2858 o CSI '2' ']' -- start of right-to-left text
2859 The control character CSI (0x9B: control sequence introducer) is
2860 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2862 Character composition specification takes the following form:
2863 o ESC '0' -- start relative composition
2864 o ESC '1' -- end composition
2865 o ESC '2' -- start rule-base composition (*)
2866 o ESC '3' -- start relative composition with alternate chars (**)
2867 o ESC '4' -- start rule-base composition with alternate chars (**)
2868 Since these are not standard escape sequences of any ISO standard,
2869 the use of them with these meanings is restricted to Emacs only.
2871 (*) This form is used only in Emacs 20.7 and older versions,
2872 but newer versions can safely decode it.
2873 (**) This form is used only in Emacs 21.1 and newer versions,
2874 and older versions can't decode it.
2876 Here's a list of example usages of these composition escape
2877 sequences (categorized by `enum composition_method').
2879 COMPOSITION_RELATIVE:
2880 ESC 0 CHAR [ CHAR ] ESC 1
2881 COMPOSITION_WITH_RULE:
2882 ESC 2 CHAR [ RULE CHAR ] ESC 1
2883 COMPOSITION_WITH_ALTCHARS:
2884 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2885 COMPOSITION_WITH_RULE_ALTCHARS:
2886 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2888 static enum iso_code_class_type iso_code_class[256];
2890 #define SAFE_CHARSET_P(coding, id) \
2891 ((id) <= (coding)->max_charset_id \
2892 && (coding)->safe_charsets[id] != 255)
2894 static void
2895 setup_iso_safe_charsets (Lisp_Object attrs)
2897 Lisp_Object charset_list, safe_charsets;
2898 Lisp_Object request;
2899 Lisp_Object reg_usage;
2900 Lisp_Object tail;
2901 EMACS_INT reg94, reg96;
2902 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2903 int max_charset_id;
2905 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2906 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2907 && ! EQ (charset_list, Viso_2022_charset_list))
2909 charset_list = Viso_2022_charset_list;
2910 ASET (attrs, coding_attr_charset_list, charset_list);
2911 ASET (attrs, coding_attr_safe_charsets, Qnil);
2914 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2915 return;
2917 max_charset_id = 0;
2918 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2920 int id = XINT (XCAR (tail));
2921 if (max_charset_id < id)
2922 max_charset_id = id;
2925 safe_charsets = make_uninit_string (max_charset_id + 1);
2926 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2927 request = AREF (attrs, coding_attr_iso_request);
2928 reg_usage = AREF (attrs, coding_attr_iso_usage);
2929 reg94 = XINT (XCAR (reg_usage));
2930 reg96 = XINT (XCDR (reg_usage));
2932 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2934 Lisp_Object id;
2935 Lisp_Object reg;
2936 struct charset *charset;
2938 id = XCAR (tail);
2939 charset = CHARSET_FROM_ID (XINT (id));
2940 reg = Fcdr (Fassq (id, request));
2941 if (! NILP (reg))
2942 SSET (safe_charsets, XINT (id), XINT (reg));
2943 else if (charset->iso_chars_96)
2945 if (reg96 < 4)
2946 SSET (safe_charsets, XINT (id), reg96);
2948 else
2950 if (reg94 < 4)
2951 SSET (safe_charsets, XINT (id), reg94);
2954 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2958 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2959 Return true if a text is encoded in one of ISO-2022 based coding
2960 systems. */
2962 static bool
2963 detect_coding_iso_2022 (struct coding_system *coding,
2964 struct coding_detection_info *detect_info)
2966 const unsigned char *src = coding->source, *src_base = src;
2967 const unsigned char *src_end = coding->source + coding->src_bytes;
2968 bool multibytep = coding->src_multibyte;
2969 bool single_shifting = 0;
2970 int id;
2971 int c, c1;
2972 ptrdiff_t consumed_chars = 0;
2973 int i;
2974 int rejected = 0;
2975 int found = 0;
2976 int composition_count = -1;
2978 detect_info->checked |= CATEGORY_MASK_ISO;
2980 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2982 struct coding_system *this = &(coding_categories[i]);
2983 Lisp_Object attrs, val;
2985 if (this->id < 0)
2986 continue;
2987 attrs = CODING_ID_ATTRS (this->id);
2988 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2989 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2990 setup_iso_safe_charsets (attrs);
2991 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2992 this->max_charset_id = SCHARS (val) - 1;
2993 this->safe_charsets = SDATA (val);
2996 /* A coding system of this category is always ASCII compatible. */
2997 src += coding->head_ascii;
2999 while (rejected != CATEGORY_MASK_ISO)
3001 src_base = src;
3002 ONE_MORE_BYTE (c);
3003 switch (c)
3005 case ISO_CODE_ESC:
3006 if (inhibit_iso_escape_detection)
3007 break;
3008 single_shifting = 0;
3009 ONE_MORE_BYTE (c);
3010 if (c == 'N' || c == 'O')
3012 /* ESC <Fe> for SS2 or SS3. */
3013 single_shifting = 1;
3014 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3016 else if (c == '1')
3018 /* End of composition. */
3019 if (composition_count < 0
3020 || composition_count > MAX_COMPOSITION_COMPONENTS)
3021 /* Invalid */
3022 break;
3023 composition_count = -1;
3024 found |= CATEGORY_MASK_ISO;
3026 else if (c >= '0' && c <= '4')
3028 /* ESC <Fp> for start/end composition. */
3029 composition_count = 0;
3031 else
3033 if (c >= '(' && c <= '/')
3035 /* Designation sequence for a charset of dimension 1. */
3036 ONE_MORE_BYTE (c1);
3037 if (c1 < ' ' || c1 >= 0x80
3038 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3039 /* Invalid designation sequence. Just ignore. */
3040 break;
3042 else if (c == '$')
3044 /* Designation sequence for a charset of dimension 2. */
3045 ONE_MORE_BYTE (c);
3046 if (c >= '@' && c <= 'B')
3047 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3048 id = iso_charset_table[1][0][c];
3049 else if (c >= '(' && c <= '/')
3051 ONE_MORE_BYTE (c1);
3052 if (c1 < ' ' || c1 >= 0x80
3053 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3054 /* Invalid designation sequence. Just ignore. */
3055 break;
3057 else
3058 /* Invalid designation sequence. Just ignore it. */
3059 break;
3061 else
3063 /* Invalid escape sequence. Just ignore it. */
3064 break;
3067 /* We found a valid designation sequence for CHARSET. */
3068 rejected |= CATEGORY_MASK_ISO_8BIT;
3069 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3070 id))
3071 found |= CATEGORY_MASK_ISO_7;
3072 else
3073 rejected |= CATEGORY_MASK_ISO_7;
3074 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3075 id))
3076 found |= CATEGORY_MASK_ISO_7_TIGHT;
3077 else
3078 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3079 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3080 id))
3081 found |= CATEGORY_MASK_ISO_7_ELSE;
3082 else
3083 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3084 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3085 id))
3086 found |= CATEGORY_MASK_ISO_8_ELSE;
3087 else
3088 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3090 break;
3092 case ISO_CODE_SO:
3093 case ISO_CODE_SI:
3094 /* Locking shift out/in. */
3095 if (inhibit_iso_escape_detection)
3096 break;
3097 single_shifting = 0;
3098 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3099 break;
3101 case ISO_CODE_CSI:
3102 /* Control sequence introducer. */
3103 single_shifting = 0;
3104 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3105 found |= CATEGORY_MASK_ISO_8_ELSE;
3106 goto check_extra_latin;
3108 case ISO_CODE_SS2:
3109 case ISO_CODE_SS3:
3110 /* Single shift. */
3111 if (inhibit_iso_escape_detection)
3112 break;
3113 single_shifting = 0;
3114 rejected |= CATEGORY_MASK_ISO_7BIT;
3115 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3116 & CODING_ISO_FLAG_SINGLE_SHIFT)
3118 found |= CATEGORY_MASK_ISO_8_1;
3119 single_shifting = 1;
3121 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3122 & CODING_ISO_FLAG_SINGLE_SHIFT)
3124 found |= CATEGORY_MASK_ISO_8_2;
3125 single_shifting = 1;
3127 if (single_shifting)
3128 break;
3129 goto check_extra_latin;
3131 default:
3132 if (c < 0)
3133 continue;
3134 if (c < 0x80)
3136 if (composition_count >= 0)
3137 composition_count++;
3138 single_shifting = 0;
3139 break;
3141 if (c >= 0xA0)
3143 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3144 found |= CATEGORY_MASK_ISO_8_1;
3145 /* Check the length of succeeding codes of the range
3146 0xA0..0FF. If the byte length is even, we include
3147 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3148 only when we are not single shifting. */
3149 if (! single_shifting
3150 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3152 int len = 1;
3153 while (src < src_end)
3155 src_base = src;
3156 ONE_MORE_BYTE (c);
3157 if (c < 0xA0)
3159 src = src_base;
3160 break;
3162 len++;
3165 if (len & 1 && src < src_end)
3167 rejected |= CATEGORY_MASK_ISO_8_2;
3168 if (composition_count >= 0)
3169 composition_count += len;
3171 else
3173 found |= CATEGORY_MASK_ISO_8_2;
3174 if (composition_count >= 0)
3175 composition_count += len / 2;
3178 break;
3180 check_extra_latin:
3181 if (! VECTORP (Vlatin_extra_code_table)
3182 || NILP (AREF (Vlatin_extra_code_table, c)))
3184 rejected = CATEGORY_MASK_ISO;
3185 break;
3187 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3188 & CODING_ISO_FLAG_LATIN_EXTRA)
3189 found |= CATEGORY_MASK_ISO_8_1;
3190 else
3191 rejected |= CATEGORY_MASK_ISO_8_1;
3192 rejected |= CATEGORY_MASK_ISO_8_2;
3193 break;
3196 detect_info->rejected |= CATEGORY_MASK_ISO;
3197 return 0;
3199 no_more_source:
3200 detect_info->rejected |= rejected;
3201 detect_info->found |= (found & ~rejected);
3202 return 1;
3206 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3207 escape sequence should be kept. */
3208 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3209 do { \
3210 int id, prev; \
3212 if (final < '0' || final >= 128 \
3213 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3214 || !SAFE_CHARSET_P (coding, id)) \
3216 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3217 chars_96 = -1; \
3218 break; \
3220 prev = CODING_ISO_DESIGNATION (coding, reg); \
3221 if (id == charset_jisx0201_roman) \
3223 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3224 id = charset_ascii; \
3226 else if (id == charset_jisx0208_1978) \
3228 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3229 id = charset_jisx0208; \
3231 CODING_ISO_DESIGNATION (coding, reg) = id; \
3232 /* If there was an invalid designation to REG previously, and this \
3233 designation is ASCII to REG, we should keep this designation \
3234 sequence. */ \
3235 if (prev == -2 && id == charset_ascii) \
3236 chars_96 = -1; \
3237 } while (0)
3240 /* Handle these composition sequence (ALT: alternate char):
3242 (1) relative composition: ESC 0 CHAR ... ESC 1
3243 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3244 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3245 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3247 When the start sequence (ESC 0/2/3/4) is found, this annotation
3248 header is produced.
3250 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3252 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3253 produced until the end sequence (ESC 1) is found:
3255 (1) CHAR ... CHAR
3256 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3257 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3258 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3260 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3261 annotation header is updated as below:
3263 (1) LENGTH: unchanged, NCHARS: number of CHARs
3264 (2) LENGTH: unchanged, NCHARS: number of CHARs
3265 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3266 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3268 If an error is found while composing, the annotation header is
3269 changed to:
3271 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3273 and the sequence [ -2 DECODED-RULE ] is changed to the original
3274 byte sequence as below:
3275 o the original byte sequence is B: [ B -1 ]
3276 o the original byte sequence is B1 B2: [ B1 B2 ]
3277 and the sequence [ -1 -1 ] is changed to the original byte
3278 sequence:
3279 [ ESC '0' ]
3282 /* Decode a composition rule C1 and maybe one more byte from the
3283 source, and set RULE to the encoded composition rule. If the rule
3284 is invalid, goto invalid_code. */
3286 #define DECODE_COMPOSITION_RULE(rule) \
3287 do { \
3288 rule = c1 - 32; \
3289 if (rule < 0) \
3290 goto invalid_code; \
3291 if (rule < 81) /* old format (before ver.21) */ \
3293 int gref = (rule) / 9; \
3294 int nref = (rule) % 9; \
3295 if (gref == 4) gref = 10; \
3296 if (nref == 4) nref = 10; \
3297 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3299 else /* new format (after ver.21) */ \
3301 int b; \
3303 ONE_MORE_BYTE (b); \
3304 if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32)) \
3305 goto invalid_code; \
3306 rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32); \
3307 rule += 0x100; /* Distinguish it from the old format. */ \
3309 } while (0)
3311 #define ENCODE_COMPOSITION_RULE(rule) \
3312 do { \
3313 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3315 if (rule < 0x100) /* old format */ \
3317 if (gref == 10) gref = 4; \
3318 if (nref == 10) nref = 4; \
3319 charbuf[idx] = 32 + gref * 9 + nref; \
3320 charbuf[idx + 1] = -1; \
3321 new_chars++; \
3323 else /* new format */ \
3325 charbuf[idx] = 32 + 81 + gref; \
3326 charbuf[idx + 1] = 32 + nref; \
3327 new_chars += 2; \
3329 } while (0)
3331 /* Finish the current composition as invalid. */
3333 static int
3334 finish_composition (int *charbuf, struct composition_status *cmp_status)
3336 int idx = - cmp_status->length;
3337 int new_chars;
3339 /* Recover the original ESC sequence */
3340 charbuf[idx++] = ISO_CODE_ESC;
3341 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3342 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3343 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3344 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3345 : '4');
3346 charbuf[idx++] = -2;
3347 charbuf[idx++] = 0;
3348 charbuf[idx++] = -1;
3349 new_chars = cmp_status->nchars;
3350 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3351 for (; idx < 0; idx++)
3353 int elt = charbuf[idx];
3355 if (elt == -2)
3357 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3358 idx++;
3360 else if (elt == -1)
3362 charbuf[idx++] = ISO_CODE_ESC;
3363 charbuf[idx] = '0';
3364 new_chars += 2;
3367 cmp_status->state = COMPOSING_NO;
3368 return new_chars;
3371 /* If characters are under composition, finish the composition. */
3372 #define MAYBE_FINISH_COMPOSITION() \
3373 do { \
3374 if (cmp_status->state != COMPOSING_NO) \
3375 char_offset += finish_composition (charbuf, cmp_status); \
3376 } while (0)
3378 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3380 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3381 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3382 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3383 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3385 Produce this annotation sequence now:
3387 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3390 #define DECODE_COMPOSITION_START(c1) \
3391 do { \
3392 if (c1 == '0' \
3393 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3394 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3395 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3396 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3398 *charbuf++ = -1; \
3399 *charbuf++= -1; \
3400 cmp_status->state = COMPOSING_CHAR; \
3401 cmp_status->length += 2; \
3403 else \
3405 MAYBE_FINISH_COMPOSITION (); \
3406 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3407 : c1 == '2' ? COMPOSITION_WITH_RULE \
3408 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3409 : COMPOSITION_WITH_RULE_ALTCHARS); \
3410 cmp_status->state \
3411 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3412 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3413 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3414 cmp_status->nchars = cmp_status->ncomps = 0; \
3415 coding->annotated = 1; \
3417 } while (0)
3420 /* Handle composition end sequence ESC 1. */
3422 #define DECODE_COMPOSITION_END() \
3423 do { \
3424 if (cmp_status->nchars == 0 \
3425 || ((cmp_status->state == COMPOSING_CHAR) \
3426 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3428 MAYBE_FINISH_COMPOSITION (); \
3429 goto invalid_code; \
3431 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3432 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3433 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3434 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3435 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3436 char_offset += cmp_status->nchars; \
3437 cmp_status->state = COMPOSING_NO; \
3438 } while (0)
3440 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3442 #define STORE_COMPOSITION_RULE(rule) \
3443 do { \
3444 *charbuf++ = -2; \
3445 *charbuf++ = rule; \
3446 cmp_status->length += 2; \
3447 cmp_status->state--; \
3448 } while (0)
3450 /* Store a composed char or a component char C in charbuf, and update
3451 cmp_status. */
3453 #define STORE_COMPOSITION_CHAR(c) \
3454 do { \
3455 *charbuf++ = (c); \
3456 cmp_status->length++; \
3457 if (cmp_status->state == COMPOSING_CHAR) \
3458 cmp_status->nchars++; \
3459 else \
3460 cmp_status->ncomps++; \
3461 if (cmp_status->method == COMPOSITION_WITH_RULE \
3462 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3463 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3464 cmp_status->state++; \
3465 } while (0)
3468 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3470 static void
3471 decode_coding_iso_2022 (struct coding_system *coding)
3473 const unsigned char *src = coding->source + coding->consumed;
3474 const unsigned char *src_end = coding->source + coding->src_bytes;
3475 const unsigned char *src_base;
3476 int *charbuf = coding->charbuf + coding->charbuf_used;
3477 /* We may produce two annotations (charset and composition) in one
3478 loop and one more charset annotation at the end. */
3479 int *charbuf_end
3480 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3481 ptrdiff_t consumed_chars = 0, consumed_chars_base;
3482 bool multibytep = coding->src_multibyte;
3483 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3484 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3485 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3486 int charset_id_2, charset_id_3;
3487 struct charset *charset;
3488 int c;
3489 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3490 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3491 ptrdiff_t char_offset = coding->produced_char;
3492 ptrdiff_t last_offset = char_offset;
3493 int last_id = charset_ascii;
3494 bool eol_dos
3495 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3496 int byte_after_cr = -1;
3497 int i;
3499 setup_iso_safe_charsets (attrs);
3500 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3502 if (cmp_status->state != COMPOSING_NO)
3504 if (charbuf_end - charbuf < cmp_status->length)
3505 emacs_abort ();
3506 for (i = 0; i < cmp_status->length; i++)
3507 *charbuf++ = cmp_status->carryover[i];
3508 coding->annotated = 1;
3511 while (1)
3513 int c1, c2, c3;
3515 src_base = src;
3516 consumed_chars_base = consumed_chars;
3518 if (charbuf >= charbuf_end)
3520 if (byte_after_cr >= 0)
3521 src_base--;
3522 break;
3525 if (byte_after_cr >= 0)
3526 c1 = byte_after_cr, byte_after_cr = -1;
3527 else
3528 ONE_MORE_BYTE (c1);
3529 if (c1 < 0)
3530 goto invalid_code;
3532 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3534 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3535 char_offset++;
3536 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3537 continue;
3540 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3542 if (c1 == ISO_CODE_ESC)
3544 if (src + 1 >= src_end)
3545 goto no_more_source;
3546 *charbuf++ = ISO_CODE_ESC;
3547 char_offset++;
3548 if (src[0] == '%' && src[1] == '@')
3550 src += 2;
3551 consumed_chars += 2;
3552 char_offset += 2;
3553 /* We are sure charbuf can contain two more chars. */
3554 *charbuf++ = '%';
3555 *charbuf++ = '@';
3556 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3559 else
3561 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3562 char_offset++;
3564 continue;
3567 if ((cmp_status->state == COMPOSING_RULE
3568 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3569 && c1 != ISO_CODE_ESC)
3571 int rule;
3573 DECODE_COMPOSITION_RULE (rule);
3574 STORE_COMPOSITION_RULE (rule);
3575 continue;
3578 /* We produce at most one character. */
3579 switch (iso_code_class [c1])
3581 case ISO_0x20_or_0x7F:
3582 if (charset_id_0 < 0
3583 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3584 /* This is SPACE or DEL. */
3585 charset = CHARSET_FROM_ID (charset_ascii);
3586 else
3587 charset = CHARSET_FROM_ID (charset_id_0);
3588 break;
3590 case ISO_graphic_plane_0:
3591 if (charset_id_0 < 0)
3592 charset = CHARSET_FROM_ID (charset_ascii);
3593 else
3594 charset = CHARSET_FROM_ID (charset_id_0);
3595 break;
3597 case ISO_0xA0_or_0xFF:
3598 if (charset_id_1 < 0
3599 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3600 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3601 goto invalid_code;
3602 /* This is a graphic character, we fall down ... */
3604 case ISO_graphic_plane_1:
3605 if (charset_id_1 < 0)
3606 goto invalid_code;
3607 charset = CHARSET_FROM_ID (charset_id_1);
3608 break;
3610 case ISO_control_0:
3611 if (eol_dos && c1 == '\r')
3612 ONE_MORE_BYTE (byte_after_cr);
3613 MAYBE_FINISH_COMPOSITION ();
3614 charset = CHARSET_FROM_ID (charset_ascii);
3615 break;
3617 case ISO_control_1:
3618 goto invalid_code;
3620 case ISO_shift_out:
3621 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3622 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3623 goto invalid_code;
3624 CODING_ISO_INVOCATION (coding, 0) = 1;
3625 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3626 continue;
3628 case ISO_shift_in:
3629 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3630 goto invalid_code;
3631 CODING_ISO_INVOCATION (coding, 0) = 0;
3632 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3633 continue;
3635 case ISO_single_shift_2_7:
3636 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3637 goto invalid_code;
3638 case ISO_single_shift_2:
3639 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3640 goto invalid_code;
3641 /* SS2 is handled as an escape sequence of ESC 'N' */
3642 c1 = 'N';
3643 goto label_escape_sequence;
3645 case ISO_single_shift_3:
3646 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3647 goto invalid_code;
3648 /* SS2 is handled as an escape sequence of ESC 'O' */
3649 c1 = 'O';
3650 goto label_escape_sequence;
3652 case ISO_control_sequence_introducer:
3653 /* CSI is handled as an escape sequence of ESC '[' ... */
3654 c1 = '[';
3655 goto label_escape_sequence;
3657 case ISO_escape:
3658 ONE_MORE_BYTE (c1);
3659 label_escape_sequence:
3660 /* Escape sequences handled here are invocation,
3661 designation, direction specification, and character
3662 composition specification. */
3663 switch (c1)
3665 case '&': /* revision of following character set */
3666 ONE_MORE_BYTE (c1);
3667 if (!(c1 >= '@' && c1 <= '~'))
3668 goto invalid_code;
3669 ONE_MORE_BYTE (c1);
3670 if (c1 != ISO_CODE_ESC)
3671 goto invalid_code;
3672 ONE_MORE_BYTE (c1);
3673 goto label_escape_sequence;
3675 case '$': /* designation of 2-byte character set */
3676 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3677 goto invalid_code;
3679 int reg, chars96;
3681 ONE_MORE_BYTE (c1);
3682 if (c1 >= '@' && c1 <= 'B')
3683 { /* designation of JISX0208.1978, GB2312.1980,
3684 or JISX0208.1980 */
3685 reg = 0, chars96 = 0;
3687 else if (c1 >= 0x28 && c1 <= 0x2B)
3688 { /* designation of DIMENSION2_CHARS94 character set */
3689 reg = c1 - 0x28, chars96 = 0;
3690 ONE_MORE_BYTE (c1);
3692 else if (c1 >= 0x2C && c1 <= 0x2F)
3693 { /* designation of DIMENSION2_CHARS96 character set */
3694 reg = c1 - 0x2C, chars96 = 1;
3695 ONE_MORE_BYTE (c1);
3697 else
3698 goto invalid_code;
3699 DECODE_DESIGNATION (reg, 2, chars96, c1);
3700 /* We must update these variables now. */
3701 if (reg == 0)
3702 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3703 else if (reg == 1)
3704 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3705 if (chars96 < 0)
3706 goto invalid_code;
3708 continue;
3710 case 'n': /* invocation of locking-shift-2 */
3711 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3712 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3713 goto invalid_code;
3714 CODING_ISO_INVOCATION (coding, 0) = 2;
3715 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3716 continue;
3718 case 'o': /* invocation of locking-shift-3 */
3719 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3720 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3721 goto invalid_code;
3722 CODING_ISO_INVOCATION (coding, 0) = 3;
3723 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3724 continue;
3726 case 'N': /* invocation of single-shift-2 */
3727 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3728 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3729 goto invalid_code;
3730 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3731 if (charset_id_2 < 0)
3732 charset = CHARSET_FROM_ID (charset_ascii);
3733 else
3734 charset = CHARSET_FROM_ID (charset_id_2);
3735 ONE_MORE_BYTE (c1);
3736 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3737 goto invalid_code;
3738 break;
3740 case 'O': /* invocation of single-shift-3 */
3741 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3742 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3743 goto invalid_code;
3744 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3745 if (charset_id_3 < 0)
3746 charset = CHARSET_FROM_ID (charset_ascii);
3747 else
3748 charset = CHARSET_FROM_ID (charset_id_3);
3749 ONE_MORE_BYTE (c1);
3750 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3751 goto invalid_code;
3752 break;
3754 case '0': case '2': case '3': case '4': /* start composition */
3755 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3756 goto invalid_code;
3757 if (last_id != charset_ascii)
3759 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3760 last_id = charset_ascii;
3761 last_offset = char_offset;
3763 DECODE_COMPOSITION_START (c1);
3764 continue;
3766 case '1': /* end composition */
3767 if (cmp_status->state == COMPOSING_NO)
3768 goto invalid_code;
3769 DECODE_COMPOSITION_END ();
3770 continue;
3772 case '[': /* specification of direction */
3773 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3774 goto invalid_code;
3775 /* For the moment, nested direction is not supported.
3776 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3777 left-to-right, and nonzero means right-to-left. */
3778 ONE_MORE_BYTE (c1);
3779 switch (c1)
3781 case ']': /* end of the current direction */
3782 coding->mode &= ~CODING_MODE_DIRECTION;
3784 case '0': /* end of the current direction */
3785 case '1': /* start of left-to-right direction */
3786 ONE_MORE_BYTE (c1);
3787 if (c1 == ']')
3788 coding->mode &= ~CODING_MODE_DIRECTION;
3789 else
3790 goto invalid_code;
3791 break;
3793 case '2': /* start of right-to-left direction */
3794 ONE_MORE_BYTE (c1);
3795 if (c1 == ']')
3796 coding->mode |= CODING_MODE_DIRECTION;
3797 else
3798 goto invalid_code;
3799 break;
3801 default:
3802 goto invalid_code;
3804 continue;
3806 case '%':
3807 ONE_MORE_BYTE (c1);
3808 if (c1 == '/')
3810 /* CTEXT extended segment:
3811 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3812 We keep these bytes as is for the moment.
3813 They may be decoded by post-read-conversion. */
3814 int dim, M, L;
3815 int size;
3817 ONE_MORE_BYTE (dim);
3818 if (dim < '0' || dim > '4')
3819 goto invalid_code;
3820 ONE_MORE_BYTE (M);
3821 if (M < 128)
3822 goto invalid_code;
3823 ONE_MORE_BYTE (L);
3824 if (L < 128)
3825 goto invalid_code;
3826 size = ((M - 128) * 128) + (L - 128);
3827 if (charbuf + 6 > charbuf_end)
3828 goto break_loop;
3829 *charbuf++ = ISO_CODE_ESC;
3830 *charbuf++ = '%';
3831 *charbuf++ = '/';
3832 *charbuf++ = dim;
3833 *charbuf++ = BYTE8_TO_CHAR (M);
3834 *charbuf++ = BYTE8_TO_CHAR (L);
3835 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3837 else if (c1 == 'G')
3839 /* XFree86 extension for embedding UTF-8 in CTEXT:
3840 ESC % G --UTF-8-BYTES-- ESC % @
3841 We keep these bytes as is for the moment.
3842 They may be decoded by post-read-conversion. */
3843 if (charbuf + 3 > charbuf_end)
3844 goto break_loop;
3845 *charbuf++ = ISO_CODE_ESC;
3846 *charbuf++ = '%';
3847 *charbuf++ = 'G';
3848 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3850 else
3851 goto invalid_code;
3852 continue;
3853 break;
3855 default:
3856 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3857 goto invalid_code;
3859 int reg, chars96;
3861 if (c1 >= 0x28 && c1 <= 0x2B)
3862 { /* designation of DIMENSION1_CHARS94 character set */
3863 reg = c1 - 0x28, chars96 = 0;
3864 ONE_MORE_BYTE (c1);
3866 else if (c1 >= 0x2C && c1 <= 0x2F)
3867 { /* designation of DIMENSION1_CHARS96 character set */
3868 reg = c1 - 0x2C, chars96 = 1;
3869 ONE_MORE_BYTE (c1);
3871 else
3872 goto invalid_code;
3873 DECODE_DESIGNATION (reg, 1, chars96, c1);
3874 /* We must update these variables now. */
3875 if (reg == 0)
3876 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3877 else if (reg == 1)
3878 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3879 if (chars96 < 0)
3880 goto invalid_code;
3882 continue;
3884 break;
3886 default:
3887 emacs_abort ();
3890 if (cmp_status->state == COMPOSING_NO
3891 && charset->id != charset_ascii
3892 && last_id != charset->id)
3894 if (last_id != charset_ascii)
3895 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3896 last_id = charset->id;
3897 last_offset = char_offset;
3900 /* Now we know CHARSET and 1st position code C1 of a character.
3901 Produce a decoded character while getting 2nd and 3rd
3902 position codes C2, C3 if necessary. */
3903 if (CHARSET_DIMENSION (charset) > 1)
3905 ONE_MORE_BYTE (c2);
3906 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3907 || ((c1 & 0x80) != (c2 & 0x80)))
3908 /* C2 is not in a valid range. */
3909 goto invalid_code;
3910 if (CHARSET_DIMENSION (charset) == 2)
3911 c1 = (c1 << 8) | c2;
3912 else
3914 ONE_MORE_BYTE (c3);
3915 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3916 || ((c1 & 0x80) != (c3 & 0x80)))
3917 /* C3 is not in a valid range. */
3918 goto invalid_code;
3919 c1 = (c1 << 16) | (c2 << 8) | c2;
3922 c1 &= 0x7F7F7F;
3923 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3924 if (c < 0)
3926 MAYBE_FINISH_COMPOSITION ();
3927 for (; src_base < src; src_base++, char_offset++)
3929 if (ASCII_BYTE_P (*src_base))
3930 *charbuf++ = *src_base;
3931 else
3932 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3935 else if (cmp_status->state == COMPOSING_NO)
3937 *charbuf++ = c;
3938 char_offset++;
3940 else if ((cmp_status->state == COMPOSING_CHAR
3941 ? cmp_status->nchars
3942 : cmp_status->ncomps)
3943 >= MAX_COMPOSITION_COMPONENTS)
3945 /* Too long composition. */
3946 MAYBE_FINISH_COMPOSITION ();
3947 *charbuf++ = c;
3948 char_offset++;
3950 else
3951 STORE_COMPOSITION_CHAR (c);
3952 continue;
3954 invalid_code:
3955 MAYBE_FINISH_COMPOSITION ();
3956 src = src_base;
3957 consumed_chars = consumed_chars_base;
3958 ONE_MORE_BYTE (c);
3959 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3960 char_offset++;
3961 coding->errors++;
3962 /* Reset the invocation and designation status to the safest
3963 one; i.e. designate ASCII to the graphic register 0, and
3964 invoke that register to the graphic plane 0. This typically
3965 helps the case that an designation sequence for ASCII "ESC (
3966 B" is somehow broken (e.g. broken by a newline). */
3967 CODING_ISO_INVOCATION (coding, 0) = 0;
3968 CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3969 charset_id_0 = charset_ascii;
3970 continue;
3972 break_loop:
3973 break;
3976 no_more_source:
3977 if (cmp_status->state != COMPOSING_NO)
3979 if (coding->mode & CODING_MODE_LAST_BLOCK)
3980 MAYBE_FINISH_COMPOSITION ();
3981 else
3983 charbuf -= cmp_status->length;
3984 for (i = 0; i < cmp_status->length; i++)
3985 cmp_status->carryover[i] = charbuf[i];
3988 else if (last_id != charset_ascii)
3989 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3990 coding->consumed_char += consumed_chars_base;
3991 coding->consumed = src_base - coding->source;
3992 coding->charbuf_used = charbuf - coding->charbuf;
3996 /* ISO2022 encoding stuff. */
3999 It is not enough to say just "ISO2022" on encoding, we have to
4000 specify more details. In Emacs, each coding system of ISO2022
4001 variant has the following specifications:
4002 1. Initial designation to G0 thru G3.
4003 2. Allows short-form designation?
4004 3. ASCII should be designated to G0 before control characters?
4005 4. ASCII should be designated to G0 at end of line?
4006 5. 7-bit environment or 8-bit environment?
4007 6. Use locking-shift?
4008 7. Use Single-shift?
4009 And the following two are only for Japanese:
4010 8. Use ASCII in place of JIS0201-1976-Roman?
4011 9. Use JISX0208-1983 in place of JISX0208-1978?
4012 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4013 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
4014 details.
4017 /* Produce codes (escape sequence) for designating CHARSET to graphic
4018 register REG at DST, and increment DST. If <final-char> of CHARSET is
4019 '@', 'A', or 'B' and the coding system CODING allows, produce
4020 designation sequence of short-form. */
4022 #define ENCODE_DESIGNATION(charset, reg, coding) \
4023 do { \
4024 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4025 const char *intermediate_char_94 = "()*+"; \
4026 const char *intermediate_char_96 = ",-./"; \
4027 int revision = -1; \
4029 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
4030 revision = CHARSET_ISO_REVISION (charset); \
4032 if (revision >= 0) \
4034 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
4035 EMIT_ONE_BYTE ('@' + revision); \
4037 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4038 if (CHARSET_DIMENSION (charset) == 1) \
4040 int b; \
4041 if (! CHARSET_ISO_CHARS_96 (charset)) \
4042 b = intermediate_char_94[reg]; \
4043 else \
4044 b = intermediate_char_96[reg]; \
4045 EMIT_ONE_ASCII_BYTE (b); \
4047 else \
4049 EMIT_ONE_ASCII_BYTE ('$'); \
4050 if (! CHARSET_ISO_CHARS_96 (charset)) \
4052 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
4053 || reg != 0 \
4054 || final_char < '@' || final_char > 'B') \
4055 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4057 else \
4058 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4060 EMIT_ONE_ASCII_BYTE (final_char); \
4062 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4063 } while (0)
4066 /* The following two macros produce codes (control character or escape
4067 sequence) for ISO2022 single-shift functions (single-shift-2 and
4068 single-shift-3). */
4070 #define ENCODE_SINGLE_SHIFT_2 \
4071 do { \
4072 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4073 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4074 else \
4075 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4076 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4077 } while (0)
4080 #define ENCODE_SINGLE_SHIFT_3 \
4081 do { \
4082 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4083 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4084 else \
4085 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4086 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4087 } while (0)
4090 /* The following four macros produce codes (control character or
4091 escape sequence) for ISO2022 locking-shift functions (shift-in,
4092 shift-out, locking-shift-2, and locking-shift-3). */
4094 #define ENCODE_SHIFT_IN \
4095 do { \
4096 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4097 CODING_ISO_INVOCATION (coding, 0) = 0; \
4098 } while (0)
4101 #define ENCODE_SHIFT_OUT \
4102 do { \
4103 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4104 CODING_ISO_INVOCATION (coding, 0) = 1; \
4105 } while (0)
4108 #define ENCODE_LOCKING_SHIFT_2 \
4109 do { \
4110 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4111 CODING_ISO_INVOCATION (coding, 0) = 2; \
4112 } while (0)
4115 #define ENCODE_LOCKING_SHIFT_3 \
4116 do { \
4117 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4118 CODING_ISO_INVOCATION (coding, 0) = 3; \
4119 } while (0)
4122 /* Produce codes for a DIMENSION1 character whose character set is
4123 CHARSET and whose position-code is C1. Designation and invocation
4124 sequences are also produced in advance if necessary. */
4126 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4127 do { \
4128 int id = CHARSET_ID (charset); \
4130 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4131 && id == charset_ascii) \
4133 id = charset_jisx0201_roman; \
4134 charset = CHARSET_FROM_ID (id); \
4137 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4139 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4140 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4141 else \
4142 EMIT_ONE_BYTE (c1 | 0x80); \
4143 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4144 break; \
4146 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4148 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4149 break; \
4151 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4153 EMIT_ONE_BYTE (c1 | 0x80); \
4154 break; \
4156 else \
4157 /* Since CHARSET is not yet invoked to any graphic planes, we \
4158 must invoke it, or, at first, designate it to some graphic \
4159 register. Then repeat the loop to actually produce the \
4160 character. */ \
4161 dst = encode_invocation_designation (charset, coding, dst, \
4162 &produced_chars); \
4163 } while (1)
4166 /* Produce codes for a DIMENSION2 character whose character set is
4167 CHARSET and whose position-codes are C1 and C2. Designation and
4168 invocation codes are also produced in advance if necessary. */
4170 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4171 do { \
4172 int id = CHARSET_ID (charset); \
4174 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4175 && id == charset_jisx0208) \
4177 id = charset_jisx0208_1978; \
4178 charset = CHARSET_FROM_ID (id); \
4181 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4183 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4184 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4185 else \
4186 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4187 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4188 break; \
4190 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4192 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4193 break; \
4195 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4197 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4198 break; \
4200 else \
4201 /* Since CHARSET is not yet invoked to any graphic planes, we \
4202 must invoke it, or, at first, designate it to some graphic \
4203 register. Then repeat the loop to actually produce the \
4204 character. */ \
4205 dst = encode_invocation_designation (charset, coding, dst, \
4206 &produced_chars); \
4207 } while (1)
4210 #define ENCODE_ISO_CHARACTER(charset, c) \
4211 do { \
4212 unsigned code; \
4213 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4215 if (CHARSET_DIMENSION (charset) == 1) \
4216 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4217 else \
4218 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4219 } while (0)
4222 /* Produce designation and invocation codes at a place pointed by DST
4223 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4224 Return new DST. */
4226 static unsigned char *
4227 encode_invocation_designation (struct charset *charset,
4228 struct coding_system *coding,
4229 unsigned char *dst, ptrdiff_t *p_nchars)
4231 bool multibytep = coding->dst_multibyte;
4232 ptrdiff_t produced_chars = *p_nchars;
4233 int reg; /* graphic register number */
4234 int id = CHARSET_ID (charset);
4236 /* At first, check designations. */
4237 for (reg = 0; reg < 4; reg++)
4238 if (id == CODING_ISO_DESIGNATION (coding, reg))
4239 break;
4241 if (reg >= 4)
4243 /* CHARSET is not yet designated to any graphic registers. */
4244 /* At first check the requested designation. */
4245 reg = CODING_ISO_REQUEST (coding, id);
4246 if (reg < 0)
4247 /* Since CHARSET requests no special designation, designate it
4248 to graphic register 0. */
4249 reg = 0;
4251 ENCODE_DESIGNATION (charset, reg, coding);
4254 if (CODING_ISO_INVOCATION (coding, 0) != reg
4255 && CODING_ISO_INVOCATION (coding, 1) != reg)
4257 /* Since the graphic register REG is not invoked to any graphic
4258 planes, invoke it to graphic plane 0. */
4259 switch (reg)
4261 case 0: /* graphic register 0 */
4262 ENCODE_SHIFT_IN;
4263 break;
4265 case 1: /* graphic register 1 */
4266 ENCODE_SHIFT_OUT;
4267 break;
4269 case 2: /* graphic register 2 */
4270 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4271 ENCODE_SINGLE_SHIFT_2;
4272 else
4273 ENCODE_LOCKING_SHIFT_2;
4274 break;
4276 case 3: /* graphic register 3 */
4277 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4278 ENCODE_SINGLE_SHIFT_3;
4279 else
4280 ENCODE_LOCKING_SHIFT_3;
4281 break;
4285 *p_nchars = produced_chars;
4286 return dst;
4290 /* Produce codes for designation and invocation to reset the graphic
4291 planes and registers to initial state. */
4292 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4293 do { \
4294 int reg; \
4295 struct charset *charset; \
4297 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4298 ENCODE_SHIFT_IN; \
4299 for (reg = 0; reg < 4; reg++) \
4300 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4301 && (CODING_ISO_DESIGNATION (coding, reg) \
4302 != CODING_ISO_INITIAL (coding, reg))) \
4304 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4305 ENCODE_DESIGNATION (charset, reg, coding); \
4307 } while (0)
4310 /* Produce designation sequences of charsets in the line started from
4311 CHARBUF to a place pointed by DST, and return the number of
4312 produced bytes. DST should not directly point a buffer text area
4313 which may be relocated by char_charset call.
4315 If the current block ends before any end-of-line, we may fail to
4316 find all the necessary designations. */
4318 static ptrdiff_t
4319 encode_designation_at_bol (struct coding_system *coding,
4320 int *charbuf, int *charbuf_end,
4321 unsigned char *dst)
4323 unsigned char *orig = dst;
4324 struct charset *charset;
4325 /* Table of charsets to be designated to each graphic register. */
4326 int r[4];
4327 int c, found = 0, reg;
4328 ptrdiff_t produced_chars = 0;
4329 bool multibytep = coding->dst_multibyte;
4330 Lisp_Object attrs;
4331 Lisp_Object charset_list;
4333 attrs = CODING_ID_ATTRS (coding->id);
4334 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4335 if (EQ (charset_list, Qiso_2022))
4336 charset_list = Viso_2022_charset_list;
4338 for (reg = 0; reg < 4; reg++)
4339 r[reg] = -1;
4341 while (charbuf < charbuf_end && found < 4)
4343 int id;
4345 c = *charbuf++;
4346 if (c == '\n')
4347 break;
4348 charset = char_charset (c, charset_list, NULL);
4349 id = CHARSET_ID (charset);
4350 reg = CODING_ISO_REQUEST (coding, id);
4351 if (reg >= 0 && r[reg] < 0)
4353 found++;
4354 r[reg] = id;
4358 if (found)
4360 for (reg = 0; reg < 4; reg++)
4361 if (r[reg] >= 0
4362 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4363 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4366 return dst - orig;
4369 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4371 static bool
4372 encode_coding_iso_2022 (struct coding_system *coding)
4374 bool multibytep = coding->dst_multibyte;
4375 int *charbuf = coding->charbuf;
4376 int *charbuf_end = charbuf + coding->charbuf_used;
4377 unsigned char *dst = coding->destination + coding->produced;
4378 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4379 int safe_room = 16;
4380 bool bol_designation
4381 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4382 && CODING_ISO_BOL (coding));
4383 ptrdiff_t produced_chars = 0;
4384 Lisp_Object attrs, eol_type, charset_list;
4385 bool ascii_compatible;
4386 int c;
4387 int preferred_charset_id = -1;
4389 CODING_GET_INFO (coding, attrs, charset_list);
4390 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4391 if (VECTORP (eol_type))
4392 eol_type = Qunix;
4394 setup_iso_safe_charsets (attrs);
4395 /* Charset list may have been changed. */
4396 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4397 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4399 ascii_compatible
4400 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4401 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4402 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4404 while (charbuf < charbuf_end)
4406 ASSURE_DESTINATION (safe_room);
4408 if (bol_designation)
4410 /* We have to produce designation sequences if any now. */
4411 unsigned char desig_buf[16];
4412 int nbytes;
4413 ptrdiff_t offset;
4415 charset_map_loaded = 0;
4416 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4417 desig_buf);
4418 if (charset_map_loaded
4419 && (offset = coding_change_destination (coding)))
4421 dst += offset;
4422 dst_end += offset;
4424 memcpy (dst, desig_buf, nbytes);
4425 dst += nbytes;
4426 /* We are sure that designation sequences are all ASCII bytes. */
4427 produced_chars += nbytes;
4428 bol_designation = 0;
4429 ASSURE_DESTINATION (safe_room);
4432 c = *charbuf++;
4434 if (c < 0)
4436 /* Handle an annotation. */
4437 switch (*charbuf)
4439 case CODING_ANNOTATE_COMPOSITION_MASK:
4440 /* Not yet implemented. */
4441 break;
4442 case CODING_ANNOTATE_CHARSET_MASK:
4443 preferred_charset_id = charbuf[2];
4444 if (preferred_charset_id >= 0
4445 && NILP (Fmemq (make_number (preferred_charset_id),
4446 charset_list)))
4447 preferred_charset_id = -1;
4448 break;
4449 default:
4450 emacs_abort ();
4452 charbuf += -c - 1;
4453 continue;
4456 /* Now encode the character C. */
4457 if (c < 0x20 || c == 0x7F)
4459 if (c == '\n'
4460 || (c == '\r' && EQ (eol_type, Qmac)))
4462 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4463 ENCODE_RESET_PLANE_AND_REGISTER ();
4464 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4466 int i;
4468 for (i = 0; i < 4; i++)
4469 CODING_ISO_DESIGNATION (coding, i)
4470 = CODING_ISO_INITIAL (coding, i);
4472 bol_designation = ((CODING_ISO_FLAGS (coding)
4473 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4474 != 0);
4476 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4477 ENCODE_RESET_PLANE_AND_REGISTER ();
4478 EMIT_ONE_ASCII_BYTE (c);
4480 else if (ASCII_CHAR_P (c))
4482 if (ascii_compatible)
4483 EMIT_ONE_ASCII_BYTE (c);
4484 else
4486 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4487 ENCODE_ISO_CHARACTER (charset, c);
4490 else if (CHAR_BYTE8_P (c))
4492 c = CHAR_TO_BYTE8 (c);
4493 EMIT_ONE_BYTE (c);
4495 else
4497 struct charset *charset;
4499 if (preferred_charset_id >= 0)
4501 bool result;
4503 charset = CHARSET_FROM_ID (preferred_charset_id);
4504 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4505 if (! result)
4506 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4507 NULL, charset);
4509 else
4510 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4511 NULL, charset);
4512 if (!charset)
4514 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4516 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4517 charset = CHARSET_FROM_ID (charset_ascii);
4519 else
4521 c = coding->default_char;
4522 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4523 charset_list, NULL, charset);
4526 ENCODE_ISO_CHARACTER (charset, c);
4530 if (coding->mode & CODING_MODE_LAST_BLOCK
4531 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4533 ASSURE_DESTINATION (safe_room);
4534 ENCODE_RESET_PLANE_AND_REGISTER ();
4536 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4537 CODING_ISO_BOL (coding) = bol_designation;
4538 coding->produced_char += produced_chars;
4539 coding->produced = dst - coding->destination;
4540 return 0;
4544 /*** 8,9. SJIS and BIG5 handlers ***/
4546 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4547 quite widely. So, for the moment, Emacs supports them in the bare
4548 C code. But, in the future, they may be supported only by CCL. */
4550 /* SJIS is a coding system encoding three character sets: ASCII, right
4551 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4552 as is. A character of charset katakana-jisx0201 is encoded by
4553 "position-code + 0x80". A character of charset japanese-jisx0208
4554 is encoded in 2-byte but two position-codes are divided and shifted
4555 so that it fit in the range below.
4557 --- CODE RANGE of SJIS ---
4558 (character set) (range)
4559 ASCII 0x00 .. 0x7F
4560 KATAKANA-JISX0201 0xA0 .. 0xDF
4561 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4562 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4563 -------------------------------
4567 /* BIG5 is a coding system encoding two character sets: ASCII and
4568 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4569 character set and is encoded in two-byte.
4571 --- CODE RANGE of BIG5 ---
4572 (character set) (range)
4573 ASCII 0x00 .. 0x7F
4574 Big5 (1st byte) 0xA1 .. 0xFE
4575 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4576 --------------------------
4580 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4581 Return true if a text is encoded in SJIS. */
4583 static bool
4584 detect_coding_sjis (struct coding_system *coding,
4585 struct coding_detection_info *detect_info)
4587 const unsigned char *src = coding->source, *src_base;
4588 const unsigned char *src_end = coding->source + coding->src_bytes;
4589 bool multibytep = coding->src_multibyte;
4590 ptrdiff_t consumed_chars = 0;
4591 int found = 0;
4592 int c;
4593 Lisp_Object attrs, charset_list;
4594 int max_first_byte_of_2_byte_code;
4596 CODING_GET_INFO (coding, attrs, charset_list);
4597 max_first_byte_of_2_byte_code
4598 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4600 detect_info->checked |= CATEGORY_MASK_SJIS;
4601 /* A coding system of this category is always ASCII compatible. */
4602 src += coding->head_ascii;
4604 while (1)
4606 src_base = src;
4607 ONE_MORE_BYTE (c);
4608 if (c < 0x80)
4609 continue;
4610 if ((c >= 0x81 && c <= 0x9F)
4611 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4613 ONE_MORE_BYTE (c);
4614 if (c < 0x40 || c == 0x7F || c > 0xFC)
4615 break;
4616 found = CATEGORY_MASK_SJIS;
4618 else if (c >= 0xA0 && c < 0xE0)
4619 found = CATEGORY_MASK_SJIS;
4620 else
4621 break;
4623 detect_info->rejected |= CATEGORY_MASK_SJIS;
4624 return 0;
4626 no_more_source:
4627 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4629 detect_info->rejected |= CATEGORY_MASK_SJIS;
4630 return 0;
4632 detect_info->found |= found;
4633 return 1;
4636 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4637 Return true if a text is encoded in BIG5. */
4639 static bool
4640 detect_coding_big5 (struct coding_system *coding,
4641 struct coding_detection_info *detect_info)
4643 const unsigned char *src = coding->source, *src_base;
4644 const unsigned char *src_end = coding->source + coding->src_bytes;
4645 bool multibytep = coding->src_multibyte;
4646 ptrdiff_t consumed_chars = 0;
4647 int found = 0;
4648 int c;
4650 detect_info->checked |= CATEGORY_MASK_BIG5;
4651 /* A coding system of this category is always ASCII compatible. */
4652 src += coding->head_ascii;
4654 while (1)
4656 src_base = src;
4657 ONE_MORE_BYTE (c);
4658 if (c < 0x80)
4659 continue;
4660 if (c >= 0xA1)
4662 ONE_MORE_BYTE (c);
4663 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4664 return 0;
4665 found = CATEGORY_MASK_BIG5;
4667 else
4668 break;
4670 detect_info->rejected |= CATEGORY_MASK_BIG5;
4671 return 0;
4673 no_more_source:
4674 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4676 detect_info->rejected |= CATEGORY_MASK_BIG5;
4677 return 0;
4679 detect_info->found |= found;
4680 return 1;
4683 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4685 static void
4686 decode_coding_sjis (struct coding_system *coding)
4688 const unsigned char *src = coding->source + coding->consumed;
4689 const unsigned char *src_end = coding->source + coding->src_bytes;
4690 const unsigned char *src_base;
4691 int *charbuf = coding->charbuf + coding->charbuf_used;
4692 /* We may produce one charset annotation in one loop and one more at
4693 the end. */
4694 int *charbuf_end
4695 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4696 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4697 bool multibytep = coding->src_multibyte;
4698 struct charset *charset_roman, *charset_kanji, *charset_kana;
4699 struct charset *charset_kanji2;
4700 Lisp_Object attrs, charset_list, val;
4701 ptrdiff_t char_offset = coding->produced_char;
4702 ptrdiff_t last_offset = char_offset;
4703 int last_id = charset_ascii;
4704 bool eol_dos
4705 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4706 int byte_after_cr = -1;
4708 CODING_GET_INFO (coding, attrs, charset_list);
4710 val = charset_list;
4711 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4712 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4713 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4714 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4716 while (1)
4718 int c, c1;
4719 struct charset *charset;
4721 src_base = src;
4722 consumed_chars_base = consumed_chars;
4724 if (charbuf >= charbuf_end)
4726 if (byte_after_cr >= 0)
4727 src_base--;
4728 break;
4731 if (byte_after_cr >= 0)
4732 c = byte_after_cr, byte_after_cr = -1;
4733 else
4734 ONE_MORE_BYTE (c);
4735 if (c < 0)
4736 goto invalid_code;
4737 if (c < 0x80)
4739 if (eol_dos && c == '\r')
4740 ONE_MORE_BYTE (byte_after_cr);
4741 charset = charset_roman;
4743 else if (c == 0x80 || c == 0xA0)
4744 goto invalid_code;
4745 else if (c >= 0xA1 && c <= 0xDF)
4747 /* SJIS -> JISX0201-Kana */
4748 c &= 0x7F;
4749 charset = charset_kana;
4751 else if (c <= 0xEF)
4753 /* SJIS -> JISX0208 */
4754 ONE_MORE_BYTE (c1);
4755 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4756 goto invalid_code;
4757 c = (c << 8) | c1;
4758 SJIS_TO_JIS (c);
4759 charset = charset_kanji;
4761 else if (c <= 0xFC && charset_kanji2)
4763 /* SJIS -> JISX0213-2 */
4764 ONE_MORE_BYTE (c1);
4765 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4766 goto invalid_code;
4767 c = (c << 8) | c1;
4768 SJIS_TO_JIS2 (c);
4769 charset = charset_kanji2;
4771 else
4772 goto invalid_code;
4773 if (charset->id != charset_ascii
4774 && last_id != charset->id)
4776 if (last_id != charset_ascii)
4777 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4778 last_id = charset->id;
4779 last_offset = char_offset;
4781 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4782 *charbuf++ = c;
4783 char_offset++;
4784 continue;
4786 invalid_code:
4787 src = src_base;
4788 consumed_chars = consumed_chars_base;
4789 ONE_MORE_BYTE (c);
4790 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4791 char_offset++;
4792 coding->errors++;
4795 no_more_source:
4796 if (last_id != charset_ascii)
4797 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4798 coding->consumed_char += consumed_chars_base;
4799 coding->consumed = src_base - coding->source;
4800 coding->charbuf_used = charbuf - coding->charbuf;
4803 static void
4804 decode_coding_big5 (struct coding_system *coding)
4806 const unsigned char *src = coding->source + coding->consumed;
4807 const unsigned char *src_end = coding->source + coding->src_bytes;
4808 const unsigned char *src_base;
4809 int *charbuf = coding->charbuf + coding->charbuf_used;
4810 /* We may produce one charset annotation in one loop and one more at
4811 the end. */
4812 int *charbuf_end
4813 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4814 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4815 bool multibytep = coding->src_multibyte;
4816 struct charset *charset_roman, *charset_big5;
4817 Lisp_Object attrs, charset_list, val;
4818 ptrdiff_t char_offset = coding->produced_char;
4819 ptrdiff_t last_offset = char_offset;
4820 int last_id = charset_ascii;
4821 bool eol_dos
4822 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4823 int byte_after_cr = -1;
4825 CODING_GET_INFO (coding, attrs, charset_list);
4826 val = charset_list;
4827 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4828 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4830 while (1)
4832 int c, c1;
4833 struct charset *charset;
4835 src_base = src;
4836 consumed_chars_base = consumed_chars;
4838 if (charbuf >= charbuf_end)
4840 if (byte_after_cr >= 0)
4841 src_base--;
4842 break;
4845 if (byte_after_cr >= 0)
4846 c = byte_after_cr, byte_after_cr = -1;
4847 else
4848 ONE_MORE_BYTE (c);
4850 if (c < 0)
4851 goto invalid_code;
4852 if (c < 0x80)
4854 if (eol_dos && c == '\r')
4855 ONE_MORE_BYTE (byte_after_cr);
4856 charset = charset_roman;
4858 else
4860 /* BIG5 -> Big5 */
4861 if (c < 0xA1 || c > 0xFE)
4862 goto invalid_code;
4863 ONE_MORE_BYTE (c1);
4864 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4865 goto invalid_code;
4866 c = c << 8 | c1;
4867 charset = charset_big5;
4869 if (charset->id != charset_ascii
4870 && last_id != charset->id)
4872 if (last_id != charset_ascii)
4873 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4874 last_id = charset->id;
4875 last_offset = char_offset;
4877 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4878 *charbuf++ = c;
4879 char_offset++;
4880 continue;
4882 invalid_code:
4883 src = src_base;
4884 consumed_chars = consumed_chars_base;
4885 ONE_MORE_BYTE (c);
4886 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4887 char_offset++;
4888 coding->errors++;
4891 no_more_source:
4892 if (last_id != charset_ascii)
4893 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4894 coding->consumed_char += consumed_chars_base;
4895 coding->consumed = src_base - coding->source;
4896 coding->charbuf_used = charbuf - coding->charbuf;
4899 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4900 This function can encode charsets `ascii', `katakana-jisx0201',
4901 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4902 are sure that all these charsets are registered as official charset
4903 (i.e. do not have extended leading-codes). Characters of other
4904 charsets are produced without any encoding. */
4906 static bool
4907 encode_coding_sjis (struct coding_system *coding)
4909 bool multibytep = coding->dst_multibyte;
4910 int *charbuf = coding->charbuf;
4911 int *charbuf_end = charbuf + coding->charbuf_used;
4912 unsigned char *dst = coding->destination + coding->produced;
4913 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4914 int safe_room = 4;
4915 ptrdiff_t produced_chars = 0;
4916 Lisp_Object attrs, charset_list, val;
4917 bool ascii_compatible;
4918 struct charset *charset_kanji, *charset_kana;
4919 struct charset *charset_kanji2;
4920 int c;
4922 CODING_GET_INFO (coding, attrs, charset_list);
4923 val = XCDR (charset_list);
4924 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4925 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4926 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4928 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4930 while (charbuf < charbuf_end)
4932 ASSURE_DESTINATION (safe_room);
4933 c = *charbuf++;
4934 /* Now encode the character C. */
4935 if (ASCII_CHAR_P (c) && ascii_compatible)
4936 EMIT_ONE_ASCII_BYTE (c);
4937 else if (CHAR_BYTE8_P (c))
4939 c = CHAR_TO_BYTE8 (c);
4940 EMIT_ONE_BYTE (c);
4942 else
4944 unsigned code;
4945 struct charset *charset;
4946 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4947 &code, charset);
4949 if (!charset)
4951 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4953 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4954 charset = CHARSET_FROM_ID (charset_ascii);
4956 else
4958 c = coding->default_char;
4959 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4960 charset_list, &code, charset);
4963 if (code == CHARSET_INVALID_CODE (charset))
4964 emacs_abort ();
4965 if (charset == charset_kanji)
4967 int c1, c2;
4968 JIS_TO_SJIS (code);
4969 c1 = code >> 8, c2 = code & 0xFF;
4970 EMIT_TWO_BYTES (c1, c2);
4972 else if (charset == charset_kana)
4973 EMIT_ONE_BYTE (code | 0x80);
4974 else if (charset_kanji2 && charset == charset_kanji2)
4976 int c1, c2;
4978 c1 = code >> 8;
4979 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4980 || c1 == 0x28
4981 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4983 JIS_TO_SJIS2 (code);
4984 c1 = code >> 8, c2 = code & 0xFF;
4985 EMIT_TWO_BYTES (c1, c2);
4987 else
4988 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4990 else
4991 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4994 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4995 coding->produced_char += produced_chars;
4996 coding->produced = dst - coding->destination;
4997 return 0;
5000 static bool
5001 encode_coding_big5 (struct coding_system *coding)
5003 bool multibytep = coding->dst_multibyte;
5004 int *charbuf = coding->charbuf;
5005 int *charbuf_end = charbuf + coding->charbuf_used;
5006 unsigned char *dst = coding->destination + coding->produced;
5007 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5008 int safe_room = 4;
5009 ptrdiff_t produced_chars = 0;
5010 Lisp_Object attrs, charset_list, val;
5011 bool ascii_compatible;
5012 struct charset *charset_big5;
5013 int c;
5015 CODING_GET_INFO (coding, attrs, charset_list);
5016 val = XCDR (charset_list);
5017 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5018 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5020 while (charbuf < charbuf_end)
5022 ASSURE_DESTINATION (safe_room);
5023 c = *charbuf++;
5024 /* Now encode the character C. */
5025 if (ASCII_CHAR_P (c) && ascii_compatible)
5026 EMIT_ONE_ASCII_BYTE (c);
5027 else if (CHAR_BYTE8_P (c))
5029 c = CHAR_TO_BYTE8 (c);
5030 EMIT_ONE_BYTE (c);
5032 else
5034 unsigned code;
5035 struct charset *charset;
5036 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5037 &code, charset);
5039 if (! charset)
5041 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5043 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5044 charset = CHARSET_FROM_ID (charset_ascii);
5046 else
5048 c = coding->default_char;
5049 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5050 charset_list, &code, charset);
5053 if (code == CHARSET_INVALID_CODE (charset))
5054 emacs_abort ();
5055 if (charset == charset_big5)
5057 int c1, c2;
5059 c1 = code >> 8, c2 = code & 0xFF;
5060 EMIT_TWO_BYTES (c1, c2);
5062 else
5063 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5066 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5067 coding->produced_char += produced_chars;
5068 coding->produced = dst - coding->destination;
5069 return 0;
5073 /*** 10. CCL handlers ***/
5075 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5076 Return true if a text is encoded in a coding system of which
5077 encoder/decoder are written in CCL program. */
5079 static bool
5080 detect_coding_ccl (struct coding_system *coding,
5081 struct coding_detection_info *detect_info)
5083 const unsigned char *src = coding->source, *src_base;
5084 const unsigned char *src_end = coding->source + coding->src_bytes;
5085 bool multibytep = coding->src_multibyte;
5086 ptrdiff_t consumed_chars = 0;
5087 int found = 0;
5088 unsigned char *valids;
5089 ptrdiff_t head_ascii = coding->head_ascii;
5090 Lisp_Object attrs;
5092 detect_info->checked |= CATEGORY_MASK_CCL;
5094 coding = &coding_categories[coding_category_ccl];
5095 valids = CODING_CCL_VALIDS (coding);
5096 attrs = CODING_ID_ATTRS (coding->id);
5097 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5098 src += head_ascii;
5100 while (1)
5102 int c;
5104 src_base = src;
5105 ONE_MORE_BYTE (c);
5106 if (c < 0 || ! valids[c])
5107 break;
5108 if ((valids[c] > 1))
5109 found = CATEGORY_MASK_CCL;
5111 detect_info->rejected |= CATEGORY_MASK_CCL;
5112 return 0;
5114 no_more_source:
5115 detect_info->found |= found;
5116 return 1;
5119 static void
5120 decode_coding_ccl (struct coding_system *coding)
5122 const unsigned char *src = coding->source + coding->consumed;
5123 const unsigned char *src_end = coding->source + coding->src_bytes;
5124 int *charbuf = coding->charbuf + coding->charbuf_used;
5125 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5126 ptrdiff_t consumed_chars = 0;
5127 bool multibytep = coding->src_multibyte;
5128 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5129 int source_charbuf[1024];
5130 int source_byteidx[1025];
5131 Lisp_Object attrs, charset_list;
5133 CODING_GET_INFO (coding, attrs, charset_list);
5135 while (1)
5137 const unsigned char *p = src;
5138 ptrdiff_t offset;
5139 int i = 0;
5141 if (multibytep)
5143 while (i < 1024 && p < src_end)
5145 source_byteidx[i] = p - src;
5146 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5148 source_byteidx[i] = p - src;
5150 else
5151 while (i < 1024 && p < src_end)
5152 source_charbuf[i++] = *p++;
5154 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5155 ccl->last_block = 1;
5156 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5157 charset_map_loaded = 0;
5158 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5159 charset_list);
5160 if (charset_map_loaded
5161 && (offset = coding_change_source (coding)))
5163 p += offset;
5164 src += offset;
5165 src_end += offset;
5167 charbuf += ccl->produced;
5168 if (multibytep)
5169 src += source_byteidx[ccl->consumed];
5170 else
5171 src += ccl->consumed;
5172 consumed_chars += ccl->consumed;
5173 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5174 break;
5177 switch (ccl->status)
5179 case CCL_STAT_SUSPEND_BY_SRC:
5180 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5181 break;
5182 case CCL_STAT_SUSPEND_BY_DST:
5183 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5184 break;
5185 case CCL_STAT_QUIT:
5186 case CCL_STAT_INVALID_CMD:
5187 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5188 break;
5189 default:
5190 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5191 break;
5193 coding->consumed_char += consumed_chars;
5194 coding->consumed = src - coding->source;
5195 coding->charbuf_used = charbuf - coding->charbuf;
5198 static bool
5199 encode_coding_ccl (struct coding_system *coding)
5201 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5202 bool multibytep = coding->dst_multibyte;
5203 int *charbuf = coding->charbuf;
5204 int *charbuf_end = charbuf + coding->charbuf_used;
5205 unsigned char *dst = coding->destination + coding->produced;
5206 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5207 int destination_charbuf[1024];
5208 ptrdiff_t produced_chars = 0;
5209 int i;
5210 Lisp_Object attrs, charset_list;
5212 CODING_GET_INFO (coding, attrs, charset_list);
5213 if (coding->consumed_char == coding->src_chars
5214 && coding->mode & CODING_MODE_LAST_BLOCK)
5215 ccl->last_block = 1;
5219 ptrdiff_t offset;
5221 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5222 charset_map_loaded = 0;
5223 ccl_driver (ccl, charbuf, destination_charbuf,
5224 charbuf_end - charbuf, 1024, charset_list);
5225 if (charset_map_loaded
5226 && (offset = coding_change_destination (coding)))
5227 dst += offset;
5228 if (multibytep)
5230 ASSURE_DESTINATION (ccl->produced * 2);
5231 for (i = 0; i < ccl->produced; i++)
5232 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5234 else
5236 ASSURE_DESTINATION (ccl->produced);
5237 for (i = 0; i < ccl->produced; i++)
5238 *dst++ = destination_charbuf[i] & 0xFF;
5239 produced_chars += ccl->produced;
5241 charbuf += ccl->consumed;
5242 if (ccl->status == CCL_STAT_QUIT
5243 || ccl->status == CCL_STAT_INVALID_CMD)
5244 break;
5246 while (charbuf < charbuf_end);
5248 switch (ccl->status)
5250 case CCL_STAT_SUSPEND_BY_SRC:
5251 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5252 break;
5253 case CCL_STAT_SUSPEND_BY_DST:
5254 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5255 break;
5256 case CCL_STAT_QUIT:
5257 case CCL_STAT_INVALID_CMD:
5258 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5259 break;
5260 default:
5261 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5262 break;
5265 coding->produced_char += produced_chars;
5266 coding->produced = dst - coding->destination;
5267 return 0;
5271 /*** 10, 11. no-conversion handlers ***/
5273 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5275 static void
5276 decode_coding_raw_text (struct coding_system *coding)
5278 bool eol_dos
5279 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5281 coding->chars_at_source = 1;
5282 coding->consumed_char = coding->src_chars;
5283 coding->consumed = coding->src_bytes;
5284 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5286 coding->consumed_char--;
5287 coding->consumed--;
5288 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5290 else
5291 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5294 static bool
5295 encode_coding_raw_text (struct coding_system *coding)
5297 bool multibytep = coding->dst_multibyte;
5298 int *charbuf = coding->charbuf;
5299 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5300 unsigned char *dst = coding->destination + coding->produced;
5301 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5302 ptrdiff_t produced_chars = 0;
5303 int c;
5305 if (multibytep)
5307 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5309 if (coding->src_multibyte)
5310 while (charbuf < charbuf_end)
5312 ASSURE_DESTINATION (safe_room);
5313 c = *charbuf++;
5314 if (ASCII_CHAR_P (c))
5315 EMIT_ONE_ASCII_BYTE (c);
5316 else if (CHAR_BYTE8_P (c))
5318 c = CHAR_TO_BYTE8 (c);
5319 EMIT_ONE_BYTE (c);
5321 else
5323 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5325 CHAR_STRING_ADVANCE (c, p1);
5328 EMIT_ONE_BYTE (*p0);
5329 p0++;
5331 while (p0 < p1);
5334 else
5335 while (charbuf < charbuf_end)
5337 ASSURE_DESTINATION (safe_room);
5338 c = *charbuf++;
5339 EMIT_ONE_BYTE (c);
5342 else
5344 if (coding->src_multibyte)
5346 int safe_room = MAX_MULTIBYTE_LENGTH;
5348 while (charbuf < charbuf_end)
5350 ASSURE_DESTINATION (safe_room);
5351 c = *charbuf++;
5352 if (ASCII_CHAR_P (c))
5353 *dst++ = c;
5354 else if (CHAR_BYTE8_P (c))
5355 *dst++ = CHAR_TO_BYTE8 (c);
5356 else
5357 CHAR_STRING_ADVANCE (c, dst);
5360 else
5362 ASSURE_DESTINATION (charbuf_end - charbuf);
5363 while (charbuf < charbuf_end && dst < dst_end)
5364 *dst++ = *charbuf++;
5366 produced_chars = dst - (coding->destination + coding->produced);
5368 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5369 coding->produced_char += produced_chars;
5370 coding->produced = dst - coding->destination;
5371 return 0;
5374 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5375 Return true if a text is encoded in a charset-based coding system. */
5377 static bool
5378 detect_coding_charset (struct coding_system *coding,
5379 struct coding_detection_info *detect_info)
5381 const unsigned char *src = coding->source, *src_base;
5382 const unsigned char *src_end = coding->source + coding->src_bytes;
5383 bool multibytep = coding->src_multibyte;
5384 ptrdiff_t consumed_chars = 0;
5385 Lisp_Object attrs, valids, name;
5386 int found = 0;
5387 ptrdiff_t head_ascii = coding->head_ascii;
5388 bool check_latin_extra = 0;
5390 detect_info->checked |= CATEGORY_MASK_CHARSET;
5392 coding = &coding_categories[coding_category_charset];
5393 attrs = CODING_ID_ATTRS (coding->id);
5394 valids = AREF (attrs, coding_attr_charset_valids);
5395 name = CODING_ID_NAME (coding->id);
5396 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5397 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5398 || strncmp (SSDATA (SYMBOL_NAME (name)),
5399 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5400 check_latin_extra = 1;
5402 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5403 src += head_ascii;
5405 while (1)
5407 int c;
5408 Lisp_Object val;
5409 struct charset *charset;
5410 int dim, idx;
5412 src_base = src;
5413 ONE_MORE_BYTE (c);
5414 if (c < 0)
5415 continue;
5416 val = AREF (valids, c);
5417 if (NILP (val))
5418 break;
5419 if (c >= 0x80)
5421 if (c < 0xA0
5422 && check_latin_extra
5423 && (!VECTORP (Vlatin_extra_code_table)
5424 || NILP (AREF (Vlatin_extra_code_table, c))))
5425 break;
5426 found = CATEGORY_MASK_CHARSET;
5428 if (INTEGERP (val))
5430 charset = CHARSET_FROM_ID (XFASTINT (val));
5431 dim = CHARSET_DIMENSION (charset);
5432 for (idx = 1; idx < dim; idx++)
5434 if (src == src_end)
5435 goto too_short;
5436 ONE_MORE_BYTE (c);
5437 if (c < charset->code_space[(dim - 1 - idx) * 4]
5438 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5439 break;
5441 if (idx < dim)
5442 break;
5444 else
5446 idx = 1;
5447 for (; CONSP (val); val = XCDR (val))
5449 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5450 dim = CHARSET_DIMENSION (charset);
5451 while (idx < dim)
5453 if (src == src_end)
5454 goto too_short;
5455 ONE_MORE_BYTE (c);
5456 if (c < charset->code_space[(dim - 1 - idx) * 4]
5457 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5458 break;
5459 idx++;
5461 if (idx == dim)
5463 val = Qnil;
5464 break;
5467 if (CONSP (val))
5468 break;
5471 too_short:
5472 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5473 return 0;
5475 no_more_source:
5476 detect_info->found |= found;
5477 return 1;
5480 static void
5481 decode_coding_charset (struct coding_system *coding)
5483 const unsigned char *src = coding->source + coding->consumed;
5484 const unsigned char *src_end = coding->source + coding->src_bytes;
5485 const unsigned char *src_base;
5486 int *charbuf = coding->charbuf + coding->charbuf_used;
5487 /* We may produce one charset annotation in one loop and one more at
5488 the end. */
5489 int *charbuf_end
5490 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5491 ptrdiff_t consumed_chars = 0, consumed_chars_base;
5492 bool multibytep = coding->src_multibyte;
5493 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5494 Lisp_Object valids;
5495 ptrdiff_t char_offset = coding->produced_char;
5496 ptrdiff_t last_offset = char_offset;
5497 int last_id = charset_ascii;
5498 bool eol_dos
5499 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5500 int byte_after_cr = -1;
5502 valids = AREF (attrs, coding_attr_charset_valids);
5504 while (1)
5506 int c;
5507 Lisp_Object val;
5508 struct charset *charset;
5509 int dim;
5510 int len = 1;
5511 unsigned code;
5513 src_base = src;
5514 consumed_chars_base = consumed_chars;
5516 if (charbuf >= charbuf_end)
5518 if (byte_after_cr >= 0)
5519 src_base--;
5520 break;
5523 if (byte_after_cr >= 0)
5525 c = byte_after_cr;
5526 byte_after_cr = -1;
5528 else
5530 ONE_MORE_BYTE (c);
5531 if (eol_dos && c == '\r')
5532 ONE_MORE_BYTE (byte_after_cr);
5534 if (c < 0)
5535 goto invalid_code;
5536 code = c;
5538 val = AREF (valids, c);
5539 if (! INTEGERP (val) && ! CONSP (val))
5540 goto invalid_code;
5541 if (INTEGERP (val))
5543 charset = CHARSET_FROM_ID (XFASTINT (val));
5544 dim = CHARSET_DIMENSION (charset);
5545 while (len < dim)
5547 ONE_MORE_BYTE (c);
5548 code = (code << 8) | c;
5549 len++;
5551 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5552 charset, code, c);
5554 else
5556 /* VAL is a list of charset IDs. It is assured that the
5557 list is sorted by charset dimensions (smaller one
5558 comes first). */
5559 while (CONSP (val))
5561 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5562 dim = CHARSET_DIMENSION (charset);
5563 while (len < dim)
5565 ONE_MORE_BYTE (c);
5566 code = (code << 8) | c;
5567 len++;
5569 CODING_DECODE_CHAR (coding, src, src_base,
5570 src_end, charset, code, c);
5571 if (c >= 0)
5572 break;
5573 val = XCDR (val);
5576 if (c < 0)
5577 goto invalid_code;
5578 if (charset->id != charset_ascii
5579 && last_id != charset->id)
5581 if (last_id != charset_ascii)
5582 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5583 last_id = charset->id;
5584 last_offset = char_offset;
5587 *charbuf++ = c;
5588 char_offset++;
5589 continue;
5591 invalid_code:
5592 src = src_base;
5593 consumed_chars = consumed_chars_base;
5594 ONE_MORE_BYTE (c);
5595 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5596 char_offset++;
5597 coding->errors++;
5600 no_more_source:
5601 if (last_id != charset_ascii)
5602 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5603 coding->consumed_char += consumed_chars_base;
5604 coding->consumed = src_base - coding->source;
5605 coding->charbuf_used = charbuf - coding->charbuf;
5608 static bool
5609 encode_coding_charset (struct coding_system *coding)
5611 bool multibytep = coding->dst_multibyte;
5612 int *charbuf = coding->charbuf;
5613 int *charbuf_end = charbuf + coding->charbuf_used;
5614 unsigned char *dst = coding->destination + coding->produced;
5615 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5616 int safe_room = MAX_MULTIBYTE_LENGTH;
5617 ptrdiff_t produced_chars = 0;
5618 Lisp_Object attrs, charset_list;
5619 bool ascii_compatible;
5620 int c;
5622 CODING_GET_INFO (coding, attrs, charset_list);
5623 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5625 while (charbuf < charbuf_end)
5627 struct charset *charset;
5628 unsigned code;
5630 ASSURE_DESTINATION (safe_room);
5631 c = *charbuf++;
5632 if (ascii_compatible && ASCII_CHAR_P (c))
5633 EMIT_ONE_ASCII_BYTE (c);
5634 else if (CHAR_BYTE8_P (c))
5636 c = CHAR_TO_BYTE8 (c);
5637 EMIT_ONE_BYTE (c);
5639 else
5641 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5642 &code, charset);
5644 if (charset)
5646 if (CHARSET_DIMENSION (charset) == 1)
5647 EMIT_ONE_BYTE (code);
5648 else if (CHARSET_DIMENSION (charset) == 2)
5649 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5650 else if (CHARSET_DIMENSION (charset) == 3)
5651 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5652 else
5653 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5654 (code >> 8) & 0xFF, code & 0xFF);
5656 else
5658 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5659 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5660 else
5661 c = coding->default_char;
5662 EMIT_ONE_BYTE (c);
5667 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5668 coding->produced_char += produced_chars;
5669 coding->produced = dst - coding->destination;
5670 return 0;
5674 /*** 7. C library functions ***/
5676 /* Setup coding context CODING from information about CODING_SYSTEM.
5677 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5678 CODING_SYSTEM is invalid, signal an error. */
5680 void
5681 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5683 Lisp_Object attrs;
5684 Lisp_Object eol_type;
5685 Lisp_Object coding_type;
5686 Lisp_Object val;
5688 if (NILP (coding_system))
5689 coding_system = Qundecided;
5691 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5693 attrs = CODING_ID_ATTRS (coding->id);
5694 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5696 coding->mode = 0;
5697 if (VECTORP (eol_type))
5698 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5699 | CODING_REQUIRE_DETECTION_MASK);
5700 else if (! EQ (eol_type, Qunix))
5701 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5702 | CODING_REQUIRE_ENCODING_MASK);
5703 else
5704 coding->common_flags = 0;
5705 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5706 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5707 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5708 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5709 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5710 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5712 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5713 coding->max_charset_id = SCHARS (val) - 1;
5714 coding->safe_charsets = SDATA (val);
5715 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5716 coding->carryover_bytes = 0;
5718 coding_type = CODING_ATTR_TYPE (attrs);
5719 if (EQ (coding_type, Qundecided))
5721 coding->detector = NULL;
5722 coding->decoder = decode_coding_raw_text;
5723 coding->encoder = encode_coding_raw_text;
5724 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5725 coding->spec.undecided.inhibit_nbd
5726 = (encode_inhibit_flag
5727 (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5728 coding->spec.undecided.inhibit_ied
5729 = (encode_inhibit_flag
5730 (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5731 coding->spec.undecided.prefer_utf_8
5732 = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5734 else if (EQ (coding_type, Qiso_2022))
5736 int i;
5737 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5739 /* Invoke graphic register 0 to plane 0. */
5740 CODING_ISO_INVOCATION (coding, 0) = 0;
5741 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5742 CODING_ISO_INVOCATION (coding, 1)
5743 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5744 /* Setup the initial status of designation. */
5745 for (i = 0; i < 4; i++)
5746 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5747 /* Not single shifting initially. */
5748 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5749 /* Beginning of buffer should also be regarded as bol. */
5750 CODING_ISO_BOL (coding) = 1;
5751 coding->detector = detect_coding_iso_2022;
5752 coding->decoder = decode_coding_iso_2022;
5753 coding->encoder = encode_coding_iso_2022;
5754 if (flags & CODING_ISO_FLAG_SAFE)
5755 coding->mode |= CODING_MODE_SAFE_ENCODING;
5756 coding->common_flags
5757 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5758 | CODING_REQUIRE_FLUSHING_MASK);
5759 if (flags & CODING_ISO_FLAG_COMPOSITION)
5760 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5761 if (flags & CODING_ISO_FLAG_DESIGNATION)
5762 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5763 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5765 setup_iso_safe_charsets (attrs);
5766 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5767 coding->max_charset_id = SCHARS (val) - 1;
5768 coding->safe_charsets = SDATA (val);
5770 CODING_ISO_FLAGS (coding) = flags;
5771 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5772 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5773 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5774 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5776 else if (EQ (coding_type, Qcharset))
5778 coding->detector = detect_coding_charset;
5779 coding->decoder = decode_coding_charset;
5780 coding->encoder = encode_coding_charset;
5781 coding->common_flags
5782 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5784 else if (EQ (coding_type, Qutf_8))
5786 val = AREF (attrs, coding_attr_utf_bom);
5787 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5788 : EQ (val, Qt) ? utf_with_bom
5789 : utf_without_bom);
5790 coding->detector = detect_coding_utf_8;
5791 coding->decoder = decode_coding_utf_8;
5792 coding->encoder = encode_coding_utf_8;
5793 coding->common_flags
5794 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5795 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5796 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5798 else if (EQ (coding_type, Qutf_16))
5800 val = AREF (attrs, coding_attr_utf_bom);
5801 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5802 : EQ (val, Qt) ? utf_with_bom
5803 : utf_without_bom);
5804 val = AREF (attrs, coding_attr_utf_16_endian);
5805 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5806 : utf_16_little_endian);
5807 CODING_UTF_16_SURROGATE (coding) = 0;
5808 coding->detector = detect_coding_utf_16;
5809 coding->decoder = decode_coding_utf_16;
5810 coding->encoder = encode_coding_utf_16;
5811 coding->common_flags
5812 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5813 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5814 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5816 else if (EQ (coding_type, Qccl))
5818 coding->detector = detect_coding_ccl;
5819 coding->decoder = decode_coding_ccl;
5820 coding->encoder = encode_coding_ccl;
5821 coding->common_flags
5822 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5823 | CODING_REQUIRE_FLUSHING_MASK);
5825 else if (EQ (coding_type, Qemacs_mule))
5827 coding->detector = detect_coding_emacs_mule;
5828 coding->decoder = decode_coding_emacs_mule;
5829 coding->encoder = encode_coding_emacs_mule;
5830 coding->common_flags
5831 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5832 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5833 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5835 Lisp_Object tail, safe_charsets;
5836 int max_charset_id = 0;
5838 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5839 tail = XCDR (tail))
5840 if (max_charset_id < XFASTINT (XCAR (tail)))
5841 max_charset_id = XFASTINT (XCAR (tail));
5842 safe_charsets = make_uninit_string (max_charset_id + 1);
5843 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5844 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5845 tail = XCDR (tail))
5846 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5847 coding->max_charset_id = max_charset_id;
5848 coding->safe_charsets = SDATA (safe_charsets);
5850 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5851 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5853 else if (EQ (coding_type, Qshift_jis))
5855 coding->detector = detect_coding_sjis;
5856 coding->decoder = decode_coding_sjis;
5857 coding->encoder = encode_coding_sjis;
5858 coding->common_flags
5859 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5861 else if (EQ (coding_type, Qbig5))
5863 coding->detector = detect_coding_big5;
5864 coding->decoder = decode_coding_big5;
5865 coding->encoder = encode_coding_big5;
5866 coding->common_flags
5867 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5869 else /* EQ (coding_type, Qraw_text) */
5871 coding->detector = NULL;
5872 coding->decoder = decode_coding_raw_text;
5873 coding->encoder = encode_coding_raw_text;
5874 if (! EQ (eol_type, Qunix))
5876 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5877 if (! VECTORP (eol_type))
5878 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5883 return;
5886 /* Return a list of charsets supported by CODING. */
5888 Lisp_Object
5889 coding_charset_list (struct coding_system *coding)
5891 Lisp_Object attrs, charset_list;
5893 CODING_GET_INFO (coding, attrs, charset_list);
5894 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5896 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5898 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5899 charset_list = Viso_2022_charset_list;
5901 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5903 charset_list = Vemacs_mule_charset_list;
5905 return charset_list;
5909 /* Return a list of charsets supported by CODING-SYSTEM. */
5911 Lisp_Object
5912 coding_system_charset_list (Lisp_Object coding_system)
5914 ptrdiff_t id;
5915 Lisp_Object attrs, charset_list;
5917 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5918 attrs = CODING_ID_ATTRS (id);
5920 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5922 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5924 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5925 charset_list = Viso_2022_charset_list;
5926 else
5927 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5929 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5931 charset_list = Vemacs_mule_charset_list;
5933 else
5935 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5937 return charset_list;
5941 /* Return raw-text or one of its subsidiaries that has the same
5942 eol_type as CODING-SYSTEM. */
5944 Lisp_Object
5945 raw_text_coding_system (Lisp_Object coding_system)
5947 Lisp_Object spec, attrs;
5948 Lisp_Object eol_type, raw_text_eol_type;
5950 if (NILP (coding_system))
5951 return Qraw_text;
5952 spec = CODING_SYSTEM_SPEC (coding_system);
5953 attrs = AREF (spec, 0);
5955 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5956 return coding_system;
5958 eol_type = AREF (spec, 2);
5959 if (VECTORP (eol_type))
5960 return Qraw_text;
5961 spec = CODING_SYSTEM_SPEC (Qraw_text);
5962 raw_text_eol_type = AREF (spec, 2);
5963 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5964 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5965 : AREF (raw_text_eol_type, 2));
5969 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5970 the subsidiary that has the same eol-spec as PARENT (if it is not
5971 nil and specifies end-of-line format) or the system's setting
5972 (system_eol_type). */
5974 Lisp_Object
5975 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5977 Lisp_Object spec, eol_type;
5979 if (NILP (coding_system))
5980 coding_system = Qraw_text;
5981 spec = CODING_SYSTEM_SPEC (coding_system);
5982 eol_type = AREF (spec, 2);
5983 if (VECTORP (eol_type))
5985 Lisp_Object parent_eol_type;
5987 if (! NILP (parent))
5989 Lisp_Object parent_spec;
5991 parent_spec = CODING_SYSTEM_SPEC (parent);
5992 parent_eol_type = AREF (parent_spec, 2);
5993 if (VECTORP (parent_eol_type))
5994 parent_eol_type = system_eol_type;
5996 else
5997 parent_eol_type = system_eol_type;
5998 if (EQ (parent_eol_type, Qunix))
5999 coding_system = AREF (eol_type, 0);
6000 else if (EQ (parent_eol_type, Qdos))
6001 coding_system = AREF (eol_type, 1);
6002 else if (EQ (parent_eol_type, Qmac))
6003 coding_system = AREF (eol_type, 2);
6005 return coding_system;
6009 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6010 decided for writing to a process. If not, complement them, and
6011 return a new coding system. */
6013 Lisp_Object
6014 complement_process_encoding_system (Lisp_Object coding_system)
6016 Lisp_Object coding_base = Qnil, eol_base = Qnil;
6017 Lisp_Object spec, attrs;
6018 int i;
6020 for (i = 0; i < 3; i++)
6022 if (i == 1)
6023 coding_system = CDR_SAFE (Vdefault_process_coding_system);
6024 else if (i == 2)
6025 coding_system = preferred_coding_system ();
6026 spec = CODING_SYSTEM_SPEC (coding_system);
6027 if (NILP (spec))
6028 continue;
6029 attrs = AREF (spec, 0);
6030 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6031 coding_base = CODING_ATTR_BASE_NAME (attrs);
6032 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6033 eol_base = coding_system;
6034 if (! NILP (coding_base) && ! NILP (eol_base))
6035 break;
6038 if (i > 0)
6039 /* The original CODING_SYSTEM didn't specify text-conversion or
6040 eol-conversion. Be sure that we return a fully complemented
6041 coding system. */
6042 coding_system = coding_inherit_eol_type (coding_base, eol_base);
6043 return coding_system;
6047 /* Emacs has a mechanism to automatically detect a coding system if it
6048 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6049 it's impossible to distinguish some coding systems accurately
6050 because they use the same range of codes. So, at first, coding
6051 systems are categorized into 7, those are:
6053 o coding-category-emacs-mule
6055 The category for a coding system which has the same code range
6056 as Emacs' internal format. Assigned the coding-system (Lisp
6057 symbol) `emacs-mule' by default.
6059 o coding-category-sjis
6061 The category for a coding system which has the same code range
6062 as SJIS. Assigned the coding-system (Lisp
6063 symbol) `japanese-shift-jis' by default.
6065 o coding-category-iso-7
6067 The category for a coding system which has the same code range
6068 as ISO2022 of 7-bit environment. This doesn't use any locking
6069 shift and single shift functions. This can encode/decode all
6070 charsets. Assigned the coding-system (Lisp symbol)
6071 `iso-2022-7bit' by default.
6073 o coding-category-iso-7-tight
6075 Same as coding-category-iso-7 except that this can
6076 encode/decode only the specified charsets.
6078 o coding-category-iso-8-1
6080 The category for a coding system which has the same code range
6081 as ISO2022 of 8-bit environment and graphic plane 1 used only
6082 for DIMENSION1 charset. This doesn't use any locking shift
6083 and single shift functions. Assigned the coding-system (Lisp
6084 symbol) `iso-latin-1' by default.
6086 o coding-category-iso-8-2
6088 The category for a coding system which has the same code range
6089 as ISO2022 of 8-bit environment and graphic plane 1 used only
6090 for DIMENSION2 charset. This doesn't use any locking shift
6091 and single shift functions. Assigned the coding-system (Lisp
6092 symbol) `japanese-iso-8bit' by default.
6094 o coding-category-iso-7-else
6096 The category for a coding system which has the same code range
6097 as ISO2022 of 7-bit environment but uses locking shift or
6098 single shift functions. Assigned the coding-system (Lisp
6099 symbol) `iso-2022-7bit-lock' by default.
6101 o coding-category-iso-8-else
6103 The category for a coding system which has the same code range
6104 as ISO2022 of 8-bit environment but uses locking shift or
6105 single shift functions. Assigned the coding-system (Lisp
6106 symbol) `iso-2022-8bit-ss2' by default.
6108 o coding-category-big5
6110 The category for a coding system which has the same code range
6111 as BIG5. Assigned the coding-system (Lisp symbol)
6112 `cn-big5' by default.
6114 o coding-category-utf-8
6116 The category for a coding system which has the same code range
6117 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6118 symbol) `utf-8' by default.
6120 o coding-category-utf-16-be
6122 The category for a coding system in which a text has an
6123 Unicode signature (cf. Unicode Standard) in the order of BIG
6124 endian at the head. Assigned the coding-system (Lisp symbol)
6125 `utf-16-be' by default.
6127 o coding-category-utf-16-le
6129 The category for a coding system in which a text has an
6130 Unicode signature (cf. Unicode Standard) in the order of
6131 LITTLE endian at the head. Assigned the coding-system (Lisp
6132 symbol) `utf-16-le' by default.
6134 o coding-category-ccl
6136 The category for a coding system of which encoder/decoder is
6137 written in CCL programs. The default value is nil, i.e., no
6138 coding system is assigned.
6140 o coding-category-binary
6142 The category for a coding system not categorized in any of the
6143 above. Assigned the coding-system (Lisp symbol)
6144 `no-conversion' by default.
6146 Each of them is a Lisp symbol and the value is an actual
6147 `coding-system's (this is also a Lisp symbol) assigned by a user.
6148 What Emacs does actually is to detect a category of coding system.
6149 Then, it uses a `coding-system' assigned to it. If Emacs can't
6150 decide only one possible category, it selects a category of the
6151 highest priority. Priorities of categories are also specified by a
6152 user in a Lisp variable `coding-category-list'.
6156 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6157 int eol_seen);
6160 /* Return the number of ASCII characters at the head of the source.
6161 By side effects, set coding->head_ascii and update
6162 coding->eol_seen. The value of coding->eol_seen is "logical or" of
6163 EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6164 reliable only when all the source bytes are ASCII. */
6166 static int
6167 check_ascii (struct coding_system *coding)
6169 const unsigned char *src, *end;
6170 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6171 int eol_seen = coding->eol_seen;
6173 coding_set_source (coding);
6174 src = coding->source;
6175 end = src + coding->src_bytes;
6177 if (inhibit_eol_conversion
6178 || SYMBOLP (eol_type))
6180 /* We don't have to check EOL format. */
6181 while (src < end && !( *src & 0x80))
6183 if (*src++ == '\n')
6184 eol_seen |= EOL_SEEN_LF;
6187 else
6189 end--; /* We look ahead one byte for "CR LF". */
6190 while (src < end)
6192 int c = *src;
6194 if (c & 0x80)
6195 break;
6196 src++;
6197 if (c == '\r')
6199 if (*src == '\n')
6201 eol_seen |= EOL_SEEN_CRLF;
6202 src++;
6204 else
6205 eol_seen |= EOL_SEEN_CR;
6207 else if (c == '\n')
6208 eol_seen |= EOL_SEEN_LF;
6210 if (src == end)
6212 int c = *src;
6214 /* All bytes but the last one C are ASCII. */
6215 if (! (c & 0x80))
6217 if (c == '\r')
6218 eol_seen |= EOL_SEEN_CR;
6219 else if (c == '\n')
6220 eol_seen |= EOL_SEEN_LF;
6221 src++;
6225 coding->head_ascii = src - coding->source;
6226 coding->eol_seen = eol_seen;
6227 return (coding->head_ascii);
6231 /* Return the number of characters at the source if all the bytes are
6232 valid UTF-8 (of Unicode range). Otherwise, return -1. By side
6233 effects, update coding->eol_seen. The value of coding->eol_seen is
6234 "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6235 the value is reliable only when all the source bytes are valid
6236 UTF-8. */
6238 static int
6239 check_utf_8 (struct coding_system *coding)
6241 const unsigned char *src, *end;
6242 int eol_seen;
6243 int nchars = coding->head_ascii;
6245 if (coding->head_ascii < 0)
6246 check_ascii (coding);
6247 else
6248 coding_set_source (coding);
6249 src = coding->source + coding->head_ascii;
6250 /* We look ahead one byte for CR LF. */
6251 end = coding->source + coding->src_bytes - 1;
6252 eol_seen = coding->eol_seen;
6253 while (src < end)
6255 int c = *src;
6257 if (UTF_8_1_OCTET_P (*src))
6259 src++;
6260 if (c < 0x20)
6262 if (c == '\r')
6264 if (*src == '\n')
6266 eol_seen |= EOL_SEEN_CRLF;
6267 src++;
6268 nchars++;
6270 else
6271 eol_seen |= EOL_SEEN_CR;
6273 else if (c == '\n')
6274 eol_seen |= EOL_SEEN_LF;
6277 else if (UTF_8_2_OCTET_LEADING_P (c))
6279 if (c < 0xC2 /* overlong sequence */
6280 || src + 1 >= end
6281 || ! UTF_8_EXTRA_OCTET_P (src[1]))
6282 return -1;
6283 src += 2;
6285 else if (UTF_8_3_OCTET_LEADING_P (c))
6287 if (src + 2 >= end
6288 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6289 && UTF_8_EXTRA_OCTET_P (src[2])))
6290 return -1;
6291 c = (((c & 0xF) << 12)
6292 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6293 if (c < 0x800 /* overlong sequence */
6294 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6295 return -1;
6296 src += 3;
6298 else if (UTF_8_4_OCTET_LEADING_P (c))
6300 if (src + 3 >= end
6301 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6302 && UTF_8_EXTRA_OCTET_P (src[2])
6303 && UTF_8_EXTRA_OCTET_P (src[3])))
6304 return -1;
6305 c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6306 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6307 if (c < 0x10000 /* overlong sequence */
6308 || c >= 0x110000) /* non-Unicode character */
6309 return -1;
6310 src += 4;
6312 else
6313 return -1;
6314 nchars++;
6317 if (src == end)
6319 if (! UTF_8_1_OCTET_P (*src))
6320 return -1;
6321 nchars++;
6322 if (*src == '\r')
6323 eol_seen |= EOL_SEEN_CR;
6324 else if (*src == '\n')
6325 eol_seen |= EOL_SEEN_LF;
6327 coding->eol_seen = eol_seen;
6328 return nchars;
6332 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6333 SOURCE is encoded. If CATEGORY is one of
6334 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6335 two-byte, else they are encoded by one-byte.
6337 Return one of EOL_SEEN_XXX. */
6339 #define MAX_EOL_CHECK_COUNT 3
6341 static int
6342 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6343 enum coding_category category)
6345 const unsigned char *src = source, *src_end = src + src_bytes;
6346 unsigned char c;
6347 int total = 0;
6348 int eol_seen = EOL_SEEN_NONE;
6350 if ((1 << category) & CATEGORY_MASK_UTF_16)
6352 bool msb = category == (coding_category_utf_16_le
6353 | coding_category_utf_16_le_nosig);
6354 bool lsb = !msb;
6356 while (src + 1 < src_end)
6358 c = src[lsb];
6359 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6361 int this_eol;
6363 if (c == '\n')
6364 this_eol = EOL_SEEN_LF;
6365 else if (src + 3 >= src_end
6366 || src[msb + 2] != 0
6367 || src[lsb + 2] != '\n')
6368 this_eol = EOL_SEEN_CR;
6369 else
6371 this_eol = EOL_SEEN_CRLF;
6372 src += 2;
6375 if (eol_seen == EOL_SEEN_NONE)
6376 /* This is the first end-of-line. */
6377 eol_seen = this_eol;
6378 else if (eol_seen != this_eol)
6380 /* The found type is different from what found before.
6381 Allow for stray ^M characters in DOS EOL files. */
6382 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6383 || (eol_seen == EOL_SEEN_CRLF
6384 && this_eol == EOL_SEEN_CR))
6385 eol_seen = EOL_SEEN_CRLF;
6386 else
6388 eol_seen = EOL_SEEN_LF;
6389 break;
6392 if (++total == MAX_EOL_CHECK_COUNT)
6393 break;
6395 src += 2;
6398 else
6399 while (src < src_end)
6401 c = *src++;
6402 if (c == '\n' || c == '\r')
6404 int this_eol;
6406 if (c == '\n')
6407 this_eol = EOL_SEEN_LF;
6408 else if (src >= src_end || *src != '\n')
6409 this_eol = EOL_SEEN_CR;
6410 else
6411 this_eol = EOL_SEEN_CRLF, src++;
6413 if (eol_seen == EOL_SEEN_NONE)
6414 /* This is the first end-of-line. */
6415 eol_seen = this_eol;
6416 else if (eol_seen != this_eol)
6418 /* The found type is different from what found before.
6419 Allow for stray ^M characters in DOS EOL files. */
6420 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6421 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6422 eol_seen = EOL_SEEN_CRLF;
6423 else
6425 eol_seen = EOL_SEEN_LF;
6426 break;
6429 if (++total == MAX_EOL_CHECK_COUNT)
6430 break;
6433 return eol_seen;
6437 static Lisp_Object
6438 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6440 Lisp_Object eol_type;
6442 eol_type = CODING_ID_EOL_TYPE (coding->id);
6443 if (! VECTORP (eol_type))
6444 /* Already adjusted. */
6445 return eol_type;
6446 if (eol_seen & EOL_SEEN_LF)
6448 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6449 eol_type = Qunix;
6451 else if (eol_seen & EOL_SEEN_CRLF)
6453 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6454 eol_type = Qdos;
6456 else if (eol_seen & EOL_SEEN_CR)
6458 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6459 eol_type = Qmac;
6461 return eol_type;
6464 /* Detect how a text specified in CODING is encoded. If a coding
6465 system is detected, update fields of CODING by the detected coding
6466 system. */
6468 static void
6469 detect_coding (struct coding_system *coding)
6471 const unsigned char *src, *src_end;
6472 unsigned int saved_mode = coding->mode;
6473 Lisp_Object found = Qnil;
6474 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6476 coding->consumed = coding->consumed_char = 0;
6477 coding->produced = coding->produced_char = 0;
6478 coding_set_source (coding);
6480 src_end = coding->source + coding->src_bytes;
6482 coding->eol_seen = EOL_SEEN_NONE;
6483 /* If we have not yet decided the text encoding type, detect it
6484 now. */
6485 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6487 int c, i;
6488 struct coding_detection_info detect_info;
6489 bool null_byte_found = 0, eight_bit_found = 0;
6490 bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6491 inhibit_null_byte_detection);
6492 bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6493 inhibit_iso_escape_detection);
6494 bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6496 coding->head_ascii = 0;
6497 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6498 for (src = coding->source; src < src_end; src++)
6500 c = *src;
6501 if (c & 0x80)
6503 eight_bit_found = 1;
6504 if (null_byte_found)
6505 break;
6507 else if (c < 0x20)
6509 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6510 && ! inhibit_ied
6511 && ! detect_info.checked)
6513 if (detect_coding_iso_2022 (coding, &detect_info))
6515 /* We have scanned the whole data. */
6516 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6518 /* We didn't find an 8-bit code. We may
6519 have found a null-byte, but it's very
6520 rare that a binary file conforms to
6521 ISO-2022. */
6522 src = src_end;
6523 coding->head_ascii = src - coding->source;
6525 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6526 break;
6529 else if (! c && !inhibit_nbd)
6531 null_byte_found = 1;
6532 if (eight_bit_found)
6533 break;
6535 else if (! disable_ascii_optimization
6536 && ! inhibit_eol_conversion)
6538 if (c == '\r')
6540 if (src < src_end && src[1] == '\n')
6542 coding->eol_seen |= EOL_SEEN_CRLF;
6543 src++;
6544 if (! eight_bit_found)
6545 coding->head_ascii++;
6547 else
6548 coding->eol_seen |= EOL_SEEN_CR;
6550 else if (c == '\n')
6552 coding->eol_seen |= EOL_SEEN_LF;
6556 if (! eight_bit_found)
6557 coding->head_ascii++;
6559 else if (! eight_bit_found)
6560 coding->head_ascii++;
6563 if (null_byte_found || eight_bit_found
6564 || coding->head_ascii < coding->src_bytes
6565 || detect_info.found)
6567 enum coding_category category;
6568 struct coding_system *this;
6570 if (coding->head_ascii == coding->src_bytes)
6571 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6572 for (i = 0; i < coding_category_raw_text; i++)
6574 category = coding_priorities[i];
6575 this = coding_categories + category;
6576 if (detect_info.found & (1 << category))
6577 break;
6579 else
6581 if (null_byte_found)
6583 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6584 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6586 else if (prefer_utf_8
6587 && detect_coding_utf_8 (coding, &detect_info))
6589 detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6590 detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6592 for (i = 0; i < coding_category_raw_text; i++)
6594 category = coding_priorities[i];
6595 this = coding_categories + category;
6596 /* Some of this->detector (e.g. detect_coding_sjis)
6597 require this information. */
6598 coding->id = this->id;
6599 if (this->id < 0)
6601 /* No coding system of this category is defined. */
6602 detect_info.rejected |= (1 << category);
6604 else if (category >= coding_category_raw_text)
6605 continue;
6606 else if (detect_info.checked & (1 << category))
6608 if (detect_info.found & (1 << category))
6609 break;
6611 else if ((*(this->detector)) (coding, &detect_info)
6612 && detect_info.found & (1 << category))
6613 break;
6617 if (i < coding_category_raw_text)
6619 if (category == coding_category_utf_8_auto)
6621 Lisp_Object coding_systems;
6623 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6624 coding_attr_utf_bom);
6625 if (CONSP (coding_systems))
6627 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6628 found = XCAR (coding_systems);
6629 else
6630 found = XCDR (coding_systems);
6632 else
6633 found = CODING_ID_NAME (this->id);
6635 else if (category == coding_category_utf_16_auto)
6637 Lisp_Object coding_systems;
6639 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6640 coding_attr_utf_bom);
6641 if (CONSP (coding_systems))
6643 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6644 found = XCAR (coding_systems);
6645 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6646 found = XCDR (coding_systems);
6648 else
6649 found = CODING_ID_NAME (this->id);
6651 else
6652 found = CODING_ID_NAME (this->id);
6654 else if (null_byte_found)
6655 found = Qno_conversion;
6656 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6657 == CATEGORY_MASK_ANY)
6658 found = Qraw_text;
6659 else if (detect_info.rejected)
6660 for (i = 0; i < coding_category_raw_text; i++)
6661 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6663 this = coding_categories + coding_priorities[i];
6664 found = CODING_ID_NAME (this->id);
6665 break;
6669 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6670 == coding_category_utf_8_auto)
6672 Lisp_Object coding_systems;
6673 struct coding_detection_info detect_info;
6675 coding_systems
6676 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6677 detect_info.found = detect_info.rejected = 0;
6678 if (check_ascii (coding) == coding->src_bytes)
6680 if (CONSP (coding_systems))
6681 found = XCDR (coding_systems);
6683 else
6685 if (CONSP (coding_systems)
6686 && detect_coding_utf_8 (coding, &detect_info))
6688 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6689 found = XCAR (coding_systems);
6690 else
6691 found = XCDR (coding_systems);
6695 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6696 == coding_category_utf_16_auto)
6698 Lisp_Object coding_systems;
6699 struct coding_detection_info detect_info;
6701 coding_systems
6702 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6703 detect_info.found = detect_info.rejected = 0;
6704 coding->head_ascii = 0;
6705 if (CONSP (coding_systems)
6706 && detect_coding_utf_16 (coding, &detect_info))
6708 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6709 found = XCAR (coding_systems);
6710 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6711 found = XCDR (coding_systems);
6715 if (! NILP (found))
6717 int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6718 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6719 : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6720 : EOL_SEEN_LF);
6722 setup_coding_system (found, coding);
6723 if (specified_eol != EOL_SEEN_NONE)
6724 adjust_coding_eol_type (coding, specified_eol);
6727 coding->mode = saved_mode;
6731 static void
6732 decode_eol (struct coding_system *coding)
6734 Lisp_Object eol_type;
6735 unsigned char *p, *pbeg, *pend;
6737 eol_type = CODING_ID_EOL_TYPE (coding->id);
6738 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6739 return;
6741 if (NILP (coding->dst_object))
6742 pbeg = coding->destination;
6743 else
6744 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6745 pend = pbeg + coding->produced;
6747 if (VECTORP (eol_type))
6749 int eol_seen = EOL_SEEN_NONE;
6751 for (p = pbeg; p < pend; p++)
6753 if (*p == '\n')
6754 eol_seen |= EOL_SEEN_LF;
6755 else if (*p == '\r')
6757 if (p + 1 < pend && *(p + 1) == '\n')
6759 eol_seen |= EOL_SEEN_CRLF;
6760 p++;
6762 else
6763 eol_seen |= EOL_SEEN_CR;
6766 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6767 if ((eol_seen & EOL_SEEN_CRLF) != 0
6768 && (eol_seen & EOL_SEEN_CR) != 0
6769 && (eol_seen & EOL_SEEN_LF) == 0)
6770 eol_seen = EOL_SEEN_CRLF;
6771 else if (eol_seen != EOL_SEEN_NONE
6772 && eol_seen != EOL_SEEN_LF
6773 && eol_seen != EOL_SEEN_CRLF
6774 && eol_seen != EOL_SEEN_CR)
6775 eol_seen = EOL_SEEN_LF;
6776 if (eol_seen != EOL_SEEN_NONE)
6777 eol_type = adjust_coding_eol_type (coding, eol_seen);
6780 if (EQ (eol_type, Qmac))
6782 for (p = pbeg; p < pend; p++)
6783 if (*p == '\r')
6784 *p = '\n';
6786 else if (EQ (eol_type, Qdos))
6788 ptrdiff_t n = 0;
6790 if (NILP (coding->dst_object))
6792 /* Start deleting '\r' from the tail to minimize the memory
6793 movement. */
6794 for (p = pend - 2; p >= pbeg; p--)
6795 if (*p == '\r')
6797 memmove (p, p + 1, pend-- - p - 1);
6798 n++;
6801 else
6803 ptrdiff_t pos_byte = coding->dst_pos_byte;
6804 ptrdiff_t pos = coding->dst_pos;
6805 ptrdiff_t pos_end = pos + coding->produced_char - 1;
6807 while (pos < pos_end)
6809 p = BYTE_POS_ADDR (pos_byte);
6810 if (*p == '\r' && p[1] == '\n')
6812 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6813 n++;
6814 pos_end--;
6816 pos++;
6817 if (coding->dst_multibyte)
6818 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6819 else
6820 pos_byte++;
6823 coding->produced -= n;
6824 coding->produced_char -= n;
6829 /* Return a translation table (or list of them) from coding system
6830 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6831 not ENCODEP). */
6833 static Lisp_Object
6834 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6836 Lisp_Object standard, translation_table;
6837 Lisp_Object val;
6839 if (NILP (Venable_character_translation))
6841 if (max_lookup)
6842 *max_lookup = 0;
6843 return Qnil;
6845 if (encodep)
6846 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6847 standard = Vstandard_translation_table_for_encode;
6848 else
6849 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6850 standard = Vstandard_translation_table_for_decode;
6851 if (NILP (translation_table))
6852 translation_table = standard;
6853 else
6855 if (SYMBOLP (translation_table))
6856 translation_table = Fget (translation_table, Qtranslation_table);
6857 else if (CONSP (translation_table))
6859 translation_table = Fcopy_sequence (translation_table);
6860 for (val = translation_table; CONSP (val); val = XCDR (val))
6861 if (SYMBOLP (XCAR (val)))
6862 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6864 if (CHAR_TABLE_P (standard))
6866 if (CONSP (translation_table))
6867 translation_table = nconc2 (translation_table,
6868 Fcons (standard, Qnil));
6869 else
6870 translation_table = Fcons (translation_table,
6871 Fcons (standard, Qnil));
6875 if (max_lookup)
6877 *max_lookup = 1;
6878 if (CHAR_TABLE_P (translation_table)
6879 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6881 val = XCHAR_TABLE (translation_table)->extras[1];
6882 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6883 *max_lookup = XFASTINT (val);
6885 else if (CONSP (translation_table))
6887 Lisp_Object tail;
6889 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6890 if (CHAR_TABLE_P (XCAR (tail))
6891 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6893 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6894 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6895 *max_lookup = XFASTINT (tailval);
6899 return translation_table;
6902 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6903 do { \
6904 trans = Qnil; \
6905 if (CHAR_TABLE_P (table)) \
6907 trans = CHAR_TABLE_REF (table, c); \
6908 if (CHARACTERP (trans)) \
6909 c = XFASTINT (trans), trans = Qnil; \
6911 else if (CONSP (table)) \
6913 Lisp_Object tail; \
6915 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6916 if (CHAR_TABLE_P (XCAR (tail))) \
6918 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6919 if (CHARACTERP (trans)) \
6920 c = XFASTINT (trans), trans = Qnil; \
6921 else if (! NILP (trans)) \
6922 break; \
6925 } while (0)
6928 /* Return a translation of character(s) at BUF according to TRANS.
6929 TRANS is TO-CHAR or ((FROM . TO) ...) where
6930 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6931 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6932 translation is found, and Qnil if not found..
6933 If BUF is too short to lookup characters in FROM, return Qt. */
6935 static Lisp_Object
6936 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6939 if (INTEGERP (trans))
6940 return trans;
6941 for (; CONSP (trans); trans = XCDR (trans))
6943 Lisp_Object val = XCAR (trans);
6944 Lisp_Object from = XCAR (val);
6945 ptrdiff_t len = ASIZE (from);
6946 ptrdiff_t i;
6948 for (i = 0; i < len; i++)
6950 if (buf + i == buf_end)
6951 return Qt;
6952 if (XINT (AREF (from, i)) != buf[i])
6953 break;
6955 if (i == len)
6956 return val;
6958 return Qnil;
6962 static int
6963 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6964 bool last_block)
6966 unsigned char *dst = coding->destination + coding->produced;
6967 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6968 ptrdiff_t produced;
6969 ptrdiff_t produced_chars = 0;
6970 int carryover = 0;
6972 if (! coding->chars_at_source)
6974 /* Source characters are in coding->charbuf. */
6975 int *buf = coding->charbuf;
6976 int *buf_end = buf + coding->charbuf_used;
6978 if (EQ (coding->src_object, coding->dst_object))
6980 coding_set_source (coding);
6981 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6984 while (buf < buf_end)
6986 int c = *buf;
6987 ptrdiff_t i;
6989 if (c >= 0)
6991 ptrdiff_t from_nchars = 1, to_nchars = 1;
6992 Lisp_Object trans = Qnil;
6994 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6995 if (! NILP (trans))
6997 trans = get_translation (trans, buf, buf_end);
6998 if (INTEGERP (trans))
6999 c = XINT (trans);
7000 else if (CONSP (trans))
7002 from_nchars = ASIZE (XCAR (trans));
7003 trans = XCDR (trans);
7004 if (INTEGERP (trans))
7005 c = XINT (trans);
7006 else
7008 to_nchars = ASIZE (trans);
7009 c = XINT (AREF (trans, 0));
7012 else if (EQ (trans, Qt) && ! last_block)
7013 break;
7016 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7018 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7019 / MAX_MULTIBYTE_LENGTH)
7020 < to_nchars)
7021 memory_full (SIZE_MAX);
7022 dst = alloc_destination (coding,
7023 buf_end - buf
7024 + MAX_MULTIBYTE_LENGTH * to_nchars,
7025 dst);
7026 if (EQ (coding->src_object, coding->dst_object))
7028 coding_set_source (coding);
7029 dst_end = (((unsigned char *) coding->source)
7030 + coding->consumed);
7032 else
7033 dst_end = coding->destination + coding->dst_bytes;
7036 for (i = 0; i < to_nchars; i++)
7038 if (i > 0)
7039 c = XINT (AREF (trans, i));
7040 if (coding->dst_multibyte
7041 || ! CHAR_BYTE8_P (c))
7042 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7043 else
7044 *dst++ = CHAR_TO_BYTE8 (c);
7046 produced_chars += to_nchars;
7047 buf += from_nchars;
7049 else
7050 /* This is an annotation datum. (-C) is the length. */
7051 buf += -c;
7053 carryover = buf_end - buf;
7055 else
7057 /* Source characters are at coding->source. */
7058 const unsigned char *src = coding->source;
7059 const unsigned char *src_end = src + coding->consumed;
7061 if (EQ (coding->dst_object, coding->src_object))
7062 dst_end = (unsigned char *) src;
7063 if (coding->src_multibyte != coding->dst_multibyte)
7065 if (coding->src_multibyte)
7067 bool multibytep = 1;
7068 ptrdiff_t consumed_chars = 0;
7070 while (1)
7072 const unsigned char *src_base = src;
7073 int c;
7075 ONE_MORE_BYTE (c);
7076 if (dst == dst_end)
7078 if (EQ (coding->src_object, coding->dst_object))
7079 dst_end = (unsigned char *) src;
7080 if (dst == dst_end)
7082 ptrdiff_t offset = src - coding->source;
7084 dst = alloc_destination (coding, src_end - src + 1,
7085 dst);
7086 dst_end = coding->destination + coding->dst_bytes;
7087 coding_set_source (coding);
7088 src = coding->source + offset;
7089 src_end = coding->source + coding->consumed;
7090 if (EQ (coding->src_object, coding->dst_object))
7091 dst_end = (unsigned char *) src;
7094 *dst++ = c;
7095 produced_chars++;
7097 no_more_source:
7100 else
7101 while (src < src_end)
7103 bool multibytep = 1;
7104 int c = *src++;
7106 if (dst >= dst_end - 1)
7108 if (EQ (coding->src_object, coding->dst_object))
7109 dst_end = (unsigned char *) src;
7110 if (dst >= dst_end - 1)
7112 ptrdiff_t offset = src - coding->source;
7113 ptrdiff_t more_bytes;
7115 if (EQ (coding->src_object, coding->dst_object))
7116 more_bytes = ((src_end - src) / 2) + 2;
7117 else
7118 more_bytes = src_end - src + 2;
7119 dst = alloc_destination (coding, more_bytes, dst);
7120 dst_end = coding->destination + coding->dst_bytes;
7121 coding_set_source (coding);
7122 src = coding->source + offset;
7123 src_end = coding->source + coding->consumed;
7124 if (EQ (coding->src_object, coding->dst_object))
7125 dst_end = (unsigned char *) src;
7128 EMIT_ONE_BYTE (c);
7131 else
7133 if (!EQ (coding->src_object, coding->dst_object))
7135 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7137 if (require > 0)
7139 ptrdiff_t offset = src - coding->source;
7141 dst = alloc_destination (coding, require, dst);
7142 coding_set_source (coding);
7143 src = coding->source + offset;
7144 src_end = coding->source + coding->consumed;
7147 produced_chars = coding->consumed_char;
7148 while (src < src_end)
7149 *dst++ = *src++;
7153 produced = dst - (coding->destination + coding->produced);
7154 if (BUFFERP (coding->dst_object) && produced_chars > 0)
7155 insert_from_gap (produced_chars, produced, 0);
7156 coding->produced += produced;
7157 coding->produced_char += produced_chars;
7158 return carryover;
7161 /* Compose text in CODING->object according to the annotation data at
7162 CHARBUF. CHARBUF is an array:
7163 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7166 static void
7167 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7169 int len;
7170 ptrdiff_t to;
7171 enum composition_method method;
7172 Lisp_Object components;
7174 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7175 to = pos + charbuf[2];
7176 method = (enum composition_method) (charbuf[4]);
7178 if (method == COMPOSITION_RELATIVE)
7179 components = Qnil;
7180 else
7182 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7183 int i, j;
7185 if (method == COMPOSITION_WITH_RULE)
7186 len = charbuf[2] * 3 - 2;
7187 charbuf += MAX_ANNOTATION_LENGTH;
7188 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7189 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7191 if (charbuf[i] >= 0)
7192 args[j] = make_number (charbuf[i]);
7193 else
7195 i++;
7196 args[j] = make_number (charbuf[i] % 0x100);
7199 components = (i == j ? Fstring (j, args) : Fvector (j, args));
7201 compose_text (pos, to, components, Qnil, coding->dst_object);
7205 /* Put `charset' property on text in CODING->object according to
7206 the annotation data at CHARBUF. CHARBUF is an array:
7207 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7210 static void
7211 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7213 ptrdiff_t from = pos - charbuf[2];
7214 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7216 Fput_text_property (make_number (from), make_number (pos),
7217 Qcharset, CHARSET_NAME (charset),
7218 coding->dst_object);
7222 #define CHARBUF_SIZE 0x4000
7224 #define ALLOC_CONVERSION_WORK_AREA(coding) \
7225 do { \
7226 coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int)); \
7227 coding->charbuf_size = CHARBUF_SIZE; \
7228 } while (0)
7231 static void
7232 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7234 int *charbuf = coding->charbuf;
7235 int *charbuf_end = charbuf + coding->charbuf_used;
7237 if (NILP (coding->dst_object))
7238 return;
7240 while (charbuf < charbuf_end)
7242 if (*charbuf >= 0)
7243 pos++, charbuf++;
7244 else
7246 int len = -*charbuf;
7248 if (len > 2)
7249 switch (charbuf[1])
7251 case CODING_ANNOTATE_COMPOSITION_MASK:
7252 produce_composition (coding, charbuf, pos);
7253 break;
7254 case CODING_ANNOTATE_CHARSET_MASK:
7255 produce_charset (coding, charbuf, pos);
7256 break;
7258 charbuf += len;
7263 /* Decode the data at CODING->src_object into CODING->dst_object.
7264 CODING->src_object is a buffer, a string, or nil.
7265 CODING->dst_object is a buffer.
7267 If CODING->src_object is a buffer, it must be the current buffer.
7268 In this case, if CODING->src_pos is positive, it is a position of
7269 the source text in the buffer, otherwise, the source text is in the
7270 gap area of the buffer, and CODING->src_pos specifies the offset of
7271 the text from GPT (which must be the same as PT). If this is the
7272 same buffer as CODING->dst_object, CODING->src_pos must be
7273 negative.
7275 If CODING->src_object is a string, CODING->src_pos is an index to
7276 that string.
7278 If CODING->src_object is nil, CODING->source must already point to
7279 the non-relocatable memory area. In this case, CODING->src_pos is
7280 an offset from CODING->source.
7282 The decoded data is inserted at the current point of the buffer
7283 CODING->dst_object.
7286 static void
7287 decode_coding (struct coding_system *coding)
7289 Lisp_Object attrs;
7290 Lisp_Object undo_list;
7291 Lisp_Object translation_table;
7292 struct ccl_spec cclspec;
7293 int carryover;
7294 int i;
7296 USE_SAFE_ALLOCA;
7298 if (BUFFERP (coding->src_object)
7299 && coding->src_pos > 0
7300 && coding->src_pos < GPT
7301 && coding->src_pos + coding->src_chars > GPT)
7302 move_gap_both (coding->src_pos, coding->src_pos_byte);
7304 undo_list = Qt;
7305 if (BUFFERP (coding->dst_object))
7307 set_buffer_internal (XBUFFER (coding->dst_object));
7308 if (GPT != PT)
7309 move_gap_both (PT, PT_BYTE);
7311 /* We must disable undo_list in order to record the whole insert
7312 transaction via record_insert at the end. But doing so also
7313 disables the recording of the first change to the undo_list.
7314 Therefore we check for first change here and record it via
7315 record_first_change if needed. */
7316 if (MODIFF <= SAVE_MODIFF)
7317 record_first_change ();
7319 undo_list = BVAR (current_buffer, undo_list);
7320 bset_undo_list (current_buffer, Qt);
7323 coding->consumed = coding->consumed_char = 0;
7324 coding->produced = coding->produced_char = 0;
7325 coding->chars_at_source = 0;
7326 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7327 coding->errors = 0;
7329 ALLOC_CONVERSION_WORK_AREA (coding);
7331 attrs = CODING_ID_ATTRS (coding->id);
7332 translation_table = get_translation_table (attrs, 0, NULL);
7334 carryover = 0;
7335 if (coding->decoder == decode_coding_ccl)
7337 coding->spec.ccl = &cclspec;
7338 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7342 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7344 coding_set_source (coding);
7345 coding->annotated = 0;
7346 coding->charbuf_used = carryover;
7347 (*(coding->decoder)) (coding);
7348 coding_set_destination (coding);
7349 carryover = produce_chars (coding, translation_table, 0);
7350 if (coding->annotated)
7351 produce_annotation (coding, pos);
7352 for (i = 0; i < carryover; i++)
7353 coding->charbuf[i]
7354 = coding->charbuf[coding->charbuf_used - carryover + i];
7356 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7357 || (coding->consumed < coding->src_bytes
7358 && (coding->result == CODING_RESULT_SUCCESS
7359 || coding->result == CODING_RESULT_INVALID_SRC)));
7361 if (carryover > 0)
7363 coding_set_destination (coding);
7364 coding->charbuf_used = carryover;
7365 produce_chars (coding, translation_table, 1);
7368 coding->carryover_bytes = 0;
7369 if (coding->consumed < coding->src_bytes)
7371 int nbytes = coding->src_bytes - coding->consumed;
7372 const unsigned char *src;
7374 coding_set_source (coding);
7375 coding_set_destination (coding);
7376 src = coding->source + coding->consumed;
7378 if (coding->mode & CODING_MODE_LAST_BLOCK)
7380 /* Flush out unprocessed data as binary chars. We are sure
7381 that the number of data is less than the size of
7382 coding->charbuf. */
7383 coding->charbuf_used = 0;
7384 coding->chars_at_source = 0;
7386 while (nbytes-- > 0)
7388 int c = *src++;
7390 if (c & 0x80)
7391 c = BYTE8_TO_CHAR (c);
7392 coding->charbuf[coding->charbuf_used++] = c;
7394 produce_chars (coding, Qnil, 1);
7396 else
7398 /* Record unprocessed bytes in coding->carryover. We are
7399 sure that the number of data is less than the size of
7400 coding->carryover. */
7401 unsigned char *p = coding->carryover;
7403 if (nbytes > sizeof coding->carryover)
7404 nbytes = sizeof coding->carryover;
7405 coding->carryover_bytes = nbytes;
7406 while (nbytes-- > 0)
7407 *p++ = *src++;
7409 coding->consumed = coding->src_bytes;
7412 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7413 && !inhibit_eol_conversion)
7414 decode_eol (coding);
7415 if (BUFFERP (coding->dst_object))
7417 bset_undo_list (current_buffer, undo_list);
7418 record_insert (coding->dst_pos, coding->produced_char);
7421 SAFE_FREE ();
7425 /* Extract an annotation datum from a composition starting at POS and
7426 ending before LIMIT of CODING->src_object (buffer or string), store
7427 the data in BUF, set *STOP to a starting position of the next
7428 composition (if any) or to LIMIT, and return the address of the
7429 next element of BUF.
7431 If such an annotation is not found, set *STOP to a starting
7432 position of a composition after POS (if any) or to LIMIT, and
7433 return BUF. */
7435 static int *
7436 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7437 struct coding_system *coding, int *buf,
7438 ptrdiff_t *stop)
7440 ptrdiff_t start, end;
7441 Lisp_Object prop;
7443 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7444 || end > limit)
7445 *stop = limit;
7446 else if (start > pos)
7447 *stop = start;
7448 else
7450 if (start == pos)
7452 /* We found a composition. Store the corresponding
7453 annotation data in BUF. */
7454 int *head = buf;
7455 enum composition_method method = COMPOSITION_METHOD (prop);
7456 int nchars = COMPOSITION_LENGTH (prop);
7458 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7459 if (method != COMPOSITION_RELATIVE)
7461 Lisp_Object components;
7462 ptrdiff_t i, len, i_byte;
7464 components = COMPOSITION_COMPONENTS (prop);
7465 if (VECTORP (components))
7467 len = ASIZE (components);
7468 for (i = 0; i < len; i++)
7469 *buf++ = XINT (AREF (components, i));
7471 else if (STRINGP (components))
7473 len = SCHARS (components);
7474 i = i_byte = 0;
7475 while (i < len)
7477 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7478 buf++;
7481 else if (INTEGERP (components))
7483 len = 1;
7484 *buf++ = XINT (components);
7486 else if (CONSP (components))
7488 for (len = 0; CONSP (components);
7489 len++, components = XCDR (components))
7490 *buf++ = XINT (XCAR (components));
7492 else
7493 emacs_abort ();
7494 *head -= len;
7498 if (find_composition (end, limit, &start, &end, &prop,
7499 coding->src_object)
7500 && end <= limit)
7501 *stop = start;
7502 else
7503 *stop = limit;
7505 return buf;
7509 /* Extract an annotation datum from a text property `charset' at POS of
7510 CODING->src_object (buffer of string), store the data in BUF, set
7511 *STOP to the position where the value of `charset' property changes
7512 (limiting by LIMIT), and return the address of the next element of
7513 BUF.
7515 If the property value is nil, set *STOP to the position where the
7516 property value is non-nil (limiting by LIMIT), and return BUF. */
7518 static int *
7519 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7520 struct coding_system *coding, int *buf,
7521 ptrdiff_t *stop)
7523 Lisp_Object val, next;
7524 int id;
7526 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7527 if (! NILP (val) && CHARSETP (val))
7528 id = XINT (CHARSET_SYMBOL_ID (val));
7529 else
7530 id = -1;
7531 ADD_CHARSET_DATA (buf, 0, id);
7532 next = Fnext_single_property_change (make_number (pos), Qcharset,
7533 coding->src_object,
7534 make_number (limit));
7535 *stop = XINT (next);
7536 return buf;
7540 static void
7541 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7542 int max_lookup)
7544 int *buf = coding->charbuf;
7545 int *buf_end = coding->charbuf + coding->charbuf_size;
7546 const unsigned char *src = coding->source + coding->consumed;
7547 const unsigned char *src_end = coding->source + coding->src_bytes;
7548 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7549 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7550 bool multibytep = coding->src_multibyte;
7551 Lisp_Object eol_type;
7552 int c;
7553 ptrdiff_t stop, stop_composition, stop_charset;
7554 int *lookup_buf = NULL;
7556 if (! NILP (translation_table))
7557 lookup_buf = alloca (sizeof (int) * max_lookup);
7559 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7560 if (VECTORP (eol_type))
7561 eol_type = Qunix;
7563 /* Note: composition handling is not yet implemented. */
7564 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7566 if (NILP (coding->src_object))
7567 stop = stop_composition = stop_charset = end_pos;
7568 else
7570 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7571 stop = stop_composition = pos;
7572 else
7573 stop = stop_composition = end_pos;
7574 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7575 stop = stop_charset = pos;
7576 else
7577 stop_charset = end_pos;
7580 /* Compensate for CRLF and conversion. */
7581 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7582 while (buf < buf_end)
7584 Lisp_Object trans;
7586 if (pos == stop)
7588 if (pos == end_pos)
7589 break;
7590 if (pos == stop_composition)
7591 buf = handle_composition_annotation (pos, end_pos, coding,
7592 buf, &stop_composition);
7593 if (pos == stop_charset)
7594 buf = handle_charset_annotation (pos, end_pos, coding,
7595 buf, &stop_charset);
7596 stop = (stop_composition < stop_charset
7597 ? stop_composition : stop_charset);
7600 if (! multibytep)
7602 int bytes;
7604 if (coding->encoder == encode_coding_raw_text
7605 || coding->encoder == encode_coding_ccl)
7606 c = *src++, pos++;
7607 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7608 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7609 else
7610 c = BYTE8_TO_CHAR (*src), src++, pos++;
7612 else
7613 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7614 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7615 c = '\n';
7616 if (! EQ (eol_type, Qunix))
7618 if (c == '\n')
7620 if (EQ (eol_type, Qdos))
7621 *buf++ = '\r';
7622 else
7623 c = '\r';
7627 trans = Qnil;
7628 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7629 if (NILP (trans))
7630 *buf++ = c;
7631 else
7633 ptrdiff_t from_nchars = 1, to_nchars = 1;
7634 int *lookup_buf_end;
7635 const unsigned char *p = src;
7636 int i;
7638 lookup_buf[0] = c;
7639 for (i = 1; i < max_lookup && p < src_end; i++)
7640 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7641 lookup_buf_end = lookup_buf + i;
7642 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7643 if (INTEGERP (trans))
7644 c = XINT (trans);
7645 else if (CONSP (trans))
7647 from_nchars = ASIZE (XCAR (trans));
7648 trans = XCDR (trans);
7649 if (INTEGERP (trans))
7650 c = XINT (trans);
7651 else
7653 to_nchars = ASIZE (trans);
7654 if (buf_end - buf < to_nchars)
7655 break;
7656 c = XINT (AREF (trans, 0));
7659 else
7660 break;
7661 *buf++ = c;
7662 for (i = 1; i < to_nchars; i++)
7663 *buf++ = XINT (AREF (trans, i));
7664 for (i = 1; i < from_nchars; i++, pos++)
7665 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7669 coding->consumed = src - coding->source;
7670 coding->consumed_char = pos - coding->src_pos;
7671 coding->charbuf_used = buf - coding->charbuf;
7672 coding->chars_at_source = 0;
7676 /* Encode the text at CODING->src_object into CODING->dst_object.
7677 CODING->src_object is a buffer or a string.
7678 CODING->dst_object is a buffer or nil.
7680 If CODING->src_object is a buffer, it must be the current buffer.
7681 In this case, if CODING->src_pos is positive, it is a position of
7682 the source text in the buffer, otherwise. the source text is in the
7683 gap area of the buffer, and coding->src_pos specifies the offset of
7684 the text from GPT (which must be the same as PT). If this is the
7685 same buffer as CODING->dst_object, CODING->src_pos must be
7686 negative and CODING should not have `pre-write-conversion'.
7688 If CODING->src_object is a string, CODING should not have
7689 `pre-write-conversion'.
7691 If CODING->dst_object is a buffer, the encoded data is inserted at
7692 the current point of that buffer.
7694 If CODING->dst_object is nil, the encoded data is placed at the
7695 memory area specified by CODING->destination. */
7697 static void
7698 encode_coding (struct coding_system *coding)
7700 Lisp_Object attrs;
7701 Lisp_Object translation_table;
7702 int max_lookup;
7703 struct ccl_spec cclspec;
7705 USE_SAFE_ALLOCA;
7707 attrs = CODING_ID_ATTRS (coding->id);
7708 if (coding->encoder == encode_coding_raw_text)
7709 translation_table = Qnil, max_lookup = 0;
7710 else
7711 translation_table = get_translation_table (attrs, 1, &max_lookup);
7713 if (BUFFERP (coding->dst_object))
7715 set_buffer_internal (XBUFFER (coding->dst_object));
7716 coding->dst_multibyte
7717 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7720 coding->consumed = coding->consumed_char = 0;
7721 coding->produced = coding->produced_char = 0;
7722 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7723 coding->errors = 0;
7725 ALLOC_CONVERSION_WORK_AREA (coding);
7727 if (coding->encoder == encode_coding_ccl)
7729 coding->spec.ccl = &cclspec;
7730 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7732 do {
7733 coding_set_source (coding);
7734 consume_chars (coding, translation_table, max_lookup);
7735 coding_set_destination (coding);
7736 (*(coding->encoder)) (coding);
7737 } while (coding->consumed_char < coding->src_chars);
7739 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7740 insert_from_gap (coding->produced_char, coding->produced, 0);
7742 SAFE_FREE ();
7746 /* Name (or base name) of work buffer for code conversion. */
7747 static Lisp_Object Vcode_conversion_workbuf_name;
7749 /* A working buffer used by the top level conversion. Once it is
7750 created, it is never destroyed. It has the name
7751 Vcode_conversion_workbuf_name. The other working buffers are
7752 destroyed after the use is finished, and their names are modified
7753 versions of Vcode_conversion_workbuf_name. */
7754 static Lisp_Object Vcode_conversion_reused_workbuf;
7756 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7757 static bool reused_workbuf_in_use;
7760 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7761 multibyteness of returning buffer. */
7763 static Lisp_Object
7764 make_conversion_work_buffer (bool multibyte)
7766 Lisp_Object name, workbuf;
7767 struct buffer *current;
7769 if (reused_workbuf_in_use)
7771 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7772 workbuf = Fget_buffer_create (name);
7774 else
7776 reused_workbuf_in_use = 1;
7777 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7778 Vcode_conversion_reused_workbuf
7779 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7780 workbuf = Vcode_conversion_reused_workbuf;
7782 current = current_buffer;
7783 set_buffer_internal (XBUFFER (workbuf));
7784 /* We can't allow modification hooks to run in the work buffer. For
7785 instance, directory_files_internal assumes that file decoding
7786 doesn't compile new regexps. */
7787 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7788 Ferase_buffer ();
7789 bset_undo_list (current_buffer, Qt);
7790 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7791 set_buffer_internal (current);
7792 return workbuf;
7796 static Lisp_Object
7797 code_conversion_restore (Lisp_Object arg)
7799 Lisp_Object current, workbuf;
7800 struct gcpro gcpro1;
7802 GCPRO1 (arg);
7803 current = XCAR (arg);
7804 workbuf = XCDR (arg);
7805 if (! NILP (workbuf))
7807 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7808 reused_workbuf_in_use = 0;
7809 else
7810 Fkill_buffer (workbuf);
7812 set_buffer_internal (XBUFFER (current));
7813 UNGCPRO;
7814 return Qnil;
7817 Lisp_Object
7818 code_conversion_save (bool with_work_buf, bool multibyte)
7820 Lisp_Object workbuf = Qnil;
7822 if (with_work_buf)
7823 workbuf = make_conversion_work_buffer (multibyte);
7824 record_unwind_protect (code_conversion_restore,
7825 Fcons (Fcurrent_buffer (), workbuf));
7826 return workbuf;
7829 void
7830 decode_coding_gap (struct coding_system *coding,
7831 ptrdiff_t chars, ptrdiff_t bytes)
7833 ptrdiff_t count = SPECPDL_INDEX ();
7834 Lisp_Object attrs;
7836 coding->src_object = Fcurrent_buffer ();
7837 coding->src_chars = chars;
7838 coding->src_bytes = bytes;
7839 coding->src_pos = -chars;
7840 coding->src_pos_byte = -bytes;
7841 coding->src_multibyte = chars < bytes;
7842 coding->dst_object = coding->src_object;
7843 coding->dst_pos = PT;
7844 coding->dst_pos_byte = PT_BYTE;
7845 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7847 coding->head_ascii = -1;
7848 coding->detected_utf8_chars = -1;
7849 coding->eol_seen = EOL_SEEN_NONE;
7850 if (CODING_REQUIRE_DETECTION (coding))
7851 detect_coding (coding);
7852 attrs = CODING_ID_ATTRS (coding->id);
7853 if (! disable_ascii_optimization
7854 && ! coding->src_multibyte
7855 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7856 && NILP (CODING_ATTR_POST_READ (attrs))
7857 && NILP (get_translation_table (attrs, 0, NULL)))
7859 chars = coding->head_ascii;
7860 if (chars < 0)
7861 chars = check_ascii (coding);
7862 if (chars != bytes)
7864 /* There exists a non-ASCII byte. */
7865 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7867 if (coding->detected_utf8_chars >= 0)
7868 chars = coding->detected_utf8_chars;
7869 else
7870 chars = check_utf_8 (coding);
7871 if (CODING_UTF_8_BOM (coding) != utf_without_bom
7872 && coding->head_ascii == 0
7873 && coding->source[0] == UTF_8_BOM_1
7874 && coding->source[1] == UTF_8_BOM_2
7875 && coding->source[2] == UTF_8_BOM_3)
7877 chars--;
7878 bytes -= 3;
7879 coding->src_bytes -= 3;
7882 else
7883 chars = -1;
7885 if (chars >= 0)
7887 Lisp_Object eol_type;
7889 eol_type = CODING_ID_EOL_TYPE (coding->id);
7890 if (VECTORP (eol_type))
7892 if (coding->eol_seen != EOL_SEEN_NONE)
7893 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7895 if (EQ (eol_type, Qmac))
7897 unsigned char *src_end = GAP_END_ADDR;
7898 unsigned char *src = src_end - coding->src_bytes;
7900 while (src < src_end)
7902 if (*src++ == '\r')
7903 src[-1] = '\n';
7906 else if (EQ (eol_type, Qdos))
7908 unsigned char *src = GAP_END_ADDR;
7909 unsigned char *src_beg = src - coding->src_bytes;
7910 unsigned char *dst = src;
7911 ptrdiff_t diff;
7913 while (src_beg < src)
7915 *--dst = *--src;
7916 if (*src == '\n' && src > src_beg && src[-1] == '\r')
7917 src--;
7919 diff = dst - src;
7920 bytes -= diff;
7921 chars -= diff;
7923 coding->produced = bytes;
7924 coding->produced_char = chars;
7925 insert_from_gap (chars, bytes, 1);
7926 return;
7929 code_conversion_save (0, 0);
7931 coding->mode |= CODING_MODE_LAST_BLOCK;
7932 current_buffer->text->inhibit_shrinking = 1;
7933 decode_coding (coding);
7934 current_buffer->text->inhibit_shrinking = 0;
7936 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7938 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7939 Lisp_Object val;
7941 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7942 val = call1 (CODING_ATTR_POST_READ (attrs),
7943 make_number (coding->produced_char));
7944 CHECK_NATNUM (val);
7945 coding->produced_char += Z - prev_Z;
7946 coding->produced += Z_BYTE - prev_Z_BYTE;
7949 unbind_to (count, Qnil);
7953 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7954 SRC_OBJECT into DST_OBJECT by coding context CODING.
7956 SRC_OBJECT is a buffer, a string, or Qnil.
7958 If it is a buffer, the text is at point of the buffer. FROM and TO
7959 are positions in the buffer.
7961 If it is a string, the text is at the beginning of the string.
7962 FROM and TO are indices to the string.
7964 If it is nil, the text is at coding->source. FROM and TO are
7965 indices to coding->source.
7967 DST_OBJECT is a buffer, Qt, or Qnil.
7969 If it is a buffer, the decoded text is inserted at point of the
7970 buffer. If the buffer is the same as SRC_OBJECT, the source text
7971 is deleted.
7973 If it is Qt, a string is made from the decoded text, and
7974 set in CODING->dst_object.
7976 If it is Qnil, the decoded text is stored at CODING->destination.
7977 The caller must allocate CODING->dst_bytes bytes at
7978 CODING->destination by xmalloc. If the decoded text is longer than
7979 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7982 void
7983 decode_coding_object (struct coding_system *coding,
7984 Lisp_Object src_object,
7985 ptrdiff_t from, ptrdiff_t from_byte,
7986 ptrdiff_t to, ptrdiff_t to_byte,
7987 Lisp_Object dst_object)
7989 ptrdiff_t count = SPECPDL_INDEX ();
7990 unsigned char *destination IF_LINT (= NULL);
7991 ptrdiff_t dst_bytes IF_LINT (= 0);
7992 ptrdiff_t chars = to - from;
7993 ptrdiff_t bytes = to_byte - from_byte;
7994 Lisp_Object attrs;
7995 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7996 bool need_marker_adjustment = 0;
7997 Lisp_Object old_deactivate_mark;
7999 old_deactivate_mark = Vdeactivate_mark;
8001 if (NILP (dst_object))
8003 destination = coding->destination;
8004 dst_bytes = coding->dst_bytes;
8007 coding->src_object = src_object;
8008 coding->src_chars = chars;
8009 coding->src_bytes = bytes;
8010 coding->src_multibyte = chars < bytes;
8012 if (STRINGP (src_object))
8014 coding->src_pos = from;
8015 coding->src_pos_byte = from_byte;
8017 else if (BUFFERP (src_object))
8019 set_buffer_internal (XBUFFER (src_object));
8020 if (from != GPT)
8021 move_gap_both (from, from_byte);
8022 if (EQ (src_object, dst_object))
8024 struct Lisp_Marker *tail;
8026 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8028 tail->need_adjustment
8029 = tail->charpos == (tail->insertion_type ? from : to);
8030 need_marker_adjustment |= tail->need_adjustment;
8032 saved_pt = PT, saved_pt_byte = PT_BYTE;
8033 TEMP_SET_PT_BOTH (from, from_byte);
8034 current_buffer->text->inhibit_shrinking = 1;
8035 del_range_both (from, from_byte, to, to_byte, 1);
8036 coding->src_pos = -chars;
8037 coding->src_pos_byte = -bytes;
8039 else
8041 coding->src_pos = from;
8042 coding->src_pos_byte = from_byte;
8046 if (CODING_REQUIRE_DETECTION (coding))
8047 detect_coding (coding);
8048 attrs = CODING_ID_ATTRS (coding->id);
8050 if (EQ (dst_object, Qt)
8051 || (! NILP (CODING_ATTR_POST_READ (attrs))
8052 && NILP (dst_object)))
8054 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8055 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8056 coding->dst_pos = BEG;
8057 coding->dst_pos_byte = BEG_BYTE;
8059 else if (BUFFERP (dst_object))
8061 code_conversion_save (0, 0);
8062 coding->dst_object = dst_object;
8063 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8064 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8065 coding->dst_multibyte
8066 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8068 else
8070 code_conversion_save (0, 0);
8071 coding->dst_object = Qnil;
8072 /* Most callers presume this will return a multibyte result, and they
8073 won't use `binary' or `raw-text' anyway, so let's not worry about
8074 CODING_FOR_UNIBYTE. */
8075 coding->dst_multibyte = 1;
8078 decode_coding (coding);
8080 if (BUFFERP (coding->dst_object))
8081 set_buffer_internal (XBUFFER (coding->dst_object));
8083 if (! NILP (CODING_ATTR_POST_READ (attrs)))
8085 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8086 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8087 Lisp_Object val;
8089 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8090 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8091 old_deactivate_mark);
8092 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8093 make_number (coding->produced_char));
8094 UNGCPRO;
8095 CHECK_NATNUM (val);
8096 coding->produced_char += Z - prev_Z;
8097 coding->produced += Z_BYTE - prev_Z_BYTE;
8100 if (EQ (dst_object, Qt))
8102 coding->dst_object = Fbuffer_string ();
8104 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8106 set_buffer_internal (XBUFFER (coding->dst_object));
8107 if (dst_bytes < coding->produced)
8109 eassert (coding->produced > 0);
8110 destination = xrealloc (destination, coding->produced);
8111 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8112 move_gap_both (BEGV, BEGV_BYTE);
8113 memcpy (destination, BEGV_ADDR, coding->produced);
8114 coding->destination = destination;
8118 if (saved_pt >= 0)
8120 /* This is the case of:
8121 (BUFFERP (src_object) && EQ (src_object, dst_object))
8122 As we have moved PT while replacing the original buffer
8123 contents, we must recover it now. */
8124 set_buffer_internal (XBUFFER (src_object));
8125 current_buffer->text->inhibit_shrinking = 0;
8126 if (saved_pt < from)
8127 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8128 else if (saved_pt < from + chars)
8129 TEMP_SET_PT_BOTH (from, from_byte);
8130 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8131 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8132 saved_pt_byte + (coding->produced - bytes));
8133 else
8134 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8135 saved_pt_byte + (coding->produced - bytes));
8137 if (need_marker_adjustment)
8139 struct Lisp_Marker *tail;
8141 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8142 if (tail->need_adjustment)
8144 tail->need_adjustment = 0;
8145 if (tail->insertion_type)
8147 tail->bytepos = from_byte;
8148 tail->charpos = from;
8150 else
8152 tail->bytepos = from_byte + coding->produced;
8153 tail->charpos
8154 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8155 ? tail->bytepos : from + coding->produced_char);
8161 Vdeactivate_mark = old_deactivate_mark;
8162 unbind_to (count, coding->dst_object);
8166 void
8167 encode_coding_object (struct coding_system *coding,
8168 Lisp_Object src_object,
8169 ptrdiff_t from, ptrdiff_t from_byte,
8170 ptrdiff_t to, ptrdiff_t to_byte,
8171 Lisp_Object dst_object)
8173 ptrdiff_t count = SPECPDL_INDEX ();
8174 ptrdiff_t chars = to - from;
8175 ptrdiff_t bytes = to_byte - from_byte;
8176 Lisp_Object attrs;
8177 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8178 bool need_marker_adjustment = 0;
8179 bool kill_src_buffer = 0;
8180 Lisp_Object old_deactivate_mark;
8182 old_deactivate_mark = Vdeactivate_mark;
8184 coding->src_object = src_object;
8185 coding->src_chars = chars;
8186 coding->src_bytes = bytes;
8187 coding->src_multibyte = chars < bytes;
8189 attrs = CODING_ID_ATTRS (coding->id);
8191 if (EQ (src_object, dst_object))
8193 struct Lisp_Marker *tail;
8195 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8197 tail->need_adjustment
8198 = tail->charpos == (tail->insertion_type ? from : to);
8199 need_marker_adjustment |= tail->need_adjustment;
8203 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8205 coding->src_object = code_conversion_save (1, coding->src_multibyte);
8206 set_buffer_internal (XBUFFER (coding->src_object));
8207 if (STRINGP (src_object))
8208 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8209 else if (BUFFERP (src_object))
8210 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8211 else
8212 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8214 if (EQ (src_object, dst_object))
8216 set_buffer_internal (XBUFFER (src_object));
8217 saved_pt = PT, saved_pt_byte = PT_BYTE;
8218 del_range_both (from, from_byte, to, to_byte, 1);
8219 set_buffer_internal (XBUFFER (coding->src_object));
8223 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8225 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8226 old_deactivate_mark);
8227 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8228 make_number (BEG), make_number (Z));
8229 UNGCPRO;
8231 if (XBUFFER (coding->src_object) != current_buffer)
8232 kill_src_buffer = 1;
8233 coding->src_object = Fcurrent_buffer ();
8234 if (BEG != GPT)
8235 move_gap_both (BEG, BEG_BYTE);
8236 coding->src_chars = Z - BEG;
8237 coding->src_bytes = Z_BYTE - BEG_BYTE;
8238 coding->src_pos = BEG;
8239 coding->src_pos_byte = BEG_BYTE;
8240 coding->src_multibyte = Z < Z_BYTE;
8242 else if (STRINGP (src_object))
8244 code_conversion_save (0, 0);
8245 coding->src_pos = from;
8246 coding->src_pos_byte = from_byte;
8248 else if (BUFFERP (src_object))
8250 code_conversion_save (0, 0);
8251 set_buffer_internal (XBUFFER (src_object));
8252 if (EQ (src_object, dst_object))
8254 saved_pt = PT, saved_pt_byte = PT_BYTE;
8255 coding->src_object = del_range_1 (from, to, 1, 1);
8256 coding->src_pos = 0;
8257 coding->src_pos_byte = 0;
8259 else
8261 if (from < GPT && to >= GPT)
8262 move_gap_both (from, from_byte);
8263 coding->src_pos = from;
8264 coding->src_pos_byte = from_byte;
8267 else
8268 code_conversion_save (0, 0);
8270 if (BUFFERP (dst_object))
8272 coding->dst_object = dst_object;
8273 if (EQ (src_object, dst_object))
8275 coding->dst_pos = from;
8276 coding->dst_pos_byte = from_byte;
8278 else
8280 struct buffer *current = current_buffer;
8282 set_buffer_temp (XBUFFER (dst_object));
8283 coding->dst_pos = PT;
8284 coding->dst_pos_byte = PT_BYTE;
8285 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8286 set_buffer_temp (current);
8288 coding->dst_multibyte
8289 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8291 else if (EQ (dst_object, Qt))
8293 ptrdiff_t dst_bytes = max (1, coding->src_chars);
8294 coding->dst_object = Qnil;
8295 coding->destination = xmalloc (dst_bytes);
8296 coding->dst_bytes = dst_bytes;
8297 coding->dst_multibyte = 0;
8299 else
8301 coding->dst_object = Qnil;
8302 coding->dst_multibyte = 0;
8305 encode_coding (coding);
8307 if (EQ (dst_object, Qt))
8309 if (BUFFERP (coding->dst_object))
8310 coding->dst_object = Fbuffer_string ();
8311 else
8313 coding->dst_object
8314 = make_unibyte_string ((char *) coding->destination,
8315 coding->produced);
8316 xfree (coding->destination);
8320 if (saved_pt >= 0)
8322 /* This is the case of:
8323 (BUFFERP (src_object) && EQ (src_object, dst_object))
8324 As we have moved PT while replacing the original buffer
8325 contents, we must recover it now. */
8326 set_buffer_internal (XBUFFER (src_object));
8327 if (saved_pt < from)
8328 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8329 else if (saved_pt < from + chars)
8330 TEMP_SET_PT_BOTH (from, from_byte);
8331 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8332 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8333 saved_pt_byte + (coding->produced - bytes));
8334 else
8335 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8336 saved_pt_byte + (coding->produced - bytes));
8338 if (need_marker_adjustment)
8340 struct Lisp_Marker *tail;
8342 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8343 if (tail->need_adjustment)
8345 tail->need_adjustment = 0;
8346 if (tail->insertion_type)
8348 tail->bytepos = from_byte;
8349 tail->charpos = from;
8351 else
8353 tail->bytepos = from_byte + coding->produced;
8354 tail->charpos
8355 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8356 ? tail->bytepos : from + coding->produced_char);
8362 if (kill_src_buffer)
8363 Fkill_buffer (coding->src_object);
8365 Vdeactivate_mark = old_deactivate_mark;
8366 unbind_to (count, Qnil);
8370 Lisp_Object
8371 preferred_coding_system (void)
8373 int id = coding_categories[coding_priorities[0]].id;
8375 return CODING_ID_NAME (id);
8378 #if defined (WINDOWSNT) || defined (CYGWIN)
8380 Lisp_Object
8381 from_unicode (Lisp_Object str)
8383 CHECK_STRING (str);
8384 if (!STRING_MULTIBYTE (str) &&
8385 SBYTES (str) & 1)
8387 str = Fsubstring (str, make_number (0), make_number (-1));
8390 return code_convert_string_norecord (str, Qutf_16le, 0);
8393 Lisp_Object
8394 from_unicode_buffer (const wchar_t* wstr)
8396 return from_unicode (
8397 make_unibyte_string (
8398 (char*) wstr,
8399 /* we get one of the two final 0 bytes for free. */
8400 1 + sizeof (wchar_t) * wcslen (wstr)));
8403 wchar_t *
8404 to_unicode (Lisp_Object str, Lisp_Object *buf)
8406 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8407 /* We need to make another copy (in addition to the one made by
8408 code_convert_string_norecord) to ensure that the final string is
8409 _doubly_ zero terminated --- that is, that the string is
8410 terminated by two zero bytes and one utf-16le null character.
8411 Because strings are already terminated with a single zero byte,
8412 we just add one additional zero. */
8413 str = make_uninit_string (SBYTES (*buf) + 1);
8414 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8415 SDATA (str) [SBYTES (*buf)] = '\0';
8416 *buf = str;
8417 return WCSDATA (*buf);
8420 #endif /* WINDOWSNT || CYGWIN */
8423 #ifdef emacs
8424 /*** 8. Emacs Lisp library functions ***/
8426 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8427 doc: /* Return t if OBJECT is nil or a coding-system.
8428 See the documentation of `define-coding-system' for information
8429 about coding-system objects. */)
8430 (Lisp_Object object)
8432 if (NILP (object)
8433 || CODING_SYSTEM_ID (object) >= 0)
8434 return Qt;
8435 if (! SYMBOLP (object)
8436 || NILP (Fget (object, Qcoding_system_define_form)))
8437 return Qnil;
8438 return Qt;
8441 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8442 Sread_non_nil_coding_system, 1, 1, 0,
8443 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8444 (Lisp_Object prompt)
8446 Lisp_Object val;
8449 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8450 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8452 while (SCHARS (val) == 0);
8453 return (Fintern (val, Qnil));
8456 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8457 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8458 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8459 Ignores case when completing coding systems (all Emacs coding systems
8460 are lower-case). */)
8461 (Lisp_Object prompt, Lisp_Object default_coding_system)
8463 Lisp_Object val;
8464 ptrdiff_t count = SPECPDL_INDEX ();
8466 if (SYMBOLP (default_coding_system))
8467 default_coding_system = SYMBOL_NAME (default_coding_system);
8468 specbind (Qcompletion_ignore_case, Qt);
8469 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8470 Qt, Qnil, Qcoding_system_history,
8471 default_coding_system, Qnil);
8472 unbind_to (count, Qnil);
8473 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8476 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8477 1, 1, 0,
8478 doc: /* Check validity of CODING-SYSTEM.
8479 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8480 It is valid if it is nil or a symbol defined as a coding system by the
8481 function `define-coding-system'. */)
8482 (Lisp_Object coding_system)
8484 Lisp_Object define_form;
8486 define_form = Fget (coding_system, Qcoding_system_define_form);
8487 if (! NILP (define_form))
8489 Fput (coding_system, Qcoding_system_define_form, Qnil);
8490 safe_eval (define_form);
8492 if (!NILP (Fcoding_system_p (coding_system)))
8493 return coding_system;
8494 xsignal1 (Qcoding_system_error, coding_system);
8498 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8499 HIGHEST, return the coding system of the highest
8500 priority among the detected coding systems. Otherwise return a
8501 list of detected coding systems sorted by their priorities. If
8502 MULTIBYTEP, it is assumed that the bytes are in correct
8503 multibyte form but contains only ASCII and eight-bit chars.
8504 Otherwise, the bytes are raw bytes.
8506 CODING-SYSTEM controls the detection as below:
8508 If it is nil, detect both text-format and eol-format. If the
8509 text-format part of CODING-SYSTEM is already specified
8510 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8511 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8512 detect only text-format. */
8514 Lisp_Object
8515 detect_coding_system (const unsigned char *src,
8516 ptrdiff_t src_chars, ptrdiff_t src_bytes,
8517 bool highest, bool multibytep,
8518 Lisp_Object coding_system)
8520 const unsigned char *src_end = src + src_bytes;
8521 Lisp_Object attrs, eol_type;
8522 Lisp_Object val = Qnil;
8523 struct coding_system coding;
8524 ptrdiff_t id;
8525 struct coding_detection_info detect_info;
8526 enum coding_category base_category;
8527 bool null_byte_found = 0, eight_bit_found = 0;
8529 if (NILP (coding_system))
8530 coding_system = Qundecided;
8531 setup_coding_system (coding_system, &coding);
8532 attrs = CODING_ID_ATTRS (coding.id);
8533 eol_type = CODING_ID_EOL_TYPE (coding.id);
8534 coding_system = CODING_ATTR_BASE_NAME (attrs);
8536 coding.source = src;
8537 coding.src_chars = src_chars;
8538 coding.src_bytes = src_bytes;
8539 coding.src_multibyte = multibytep;
8540 coding.consumed = 0;
8541 coding.mode |= CODING_MODE_LAST_BLOCK;
8542 coding.head_ascii = 0;
8544 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8546 /* At first, detect text-format if necessary. */
8547 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8548 if (base_category == coding_category_undecided)
8550 enum coding_category category IF_LINT (= 0);
8551 struct coding_system *this IF_LINT (= NULL);
8552 int c, i;
8553 bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8554 inhibit_null_byte_detection);
8555 bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8556 inhibit_iso_escape_detection);
8557 bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8559 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8560 for (; src < src_end; src++)
8562 c = *src;
8563 if (c & 0x80)
8565 eight_bit_found = 1;
8566 if (null_byte_found)
8567 break;
8569 else if (c < 0x20)
8571 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8572 && ! inhibit_ied
8573 && ! detect_info.checked)
8575 if (detect_coding_iso_2022 (&coding, &detect_info))
8577 /* We have scanned the whole data. */
8578 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8580 /* We didn't find an 8-bit code. We may
8581 have found a null-byte, but it's very
8582 rare that a binary file confirm to
8583 ISO-2022. */
8584 src = src_end;
8585 coding.head_ascii = src - coding.source;
8587 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8588 break;
8591 else if (! c && !inhibit_nbd)
8593 null_byte_found = 1;
8594 if (eight_bit_found)
8595 break;
8597 if (! eight_bit_found)
8598 coding.head_ascii++;
8600 else if (! eight_bit_found)
8601 coding.head_ascii++;
8604 if (null_byte_found || eight_bit_found
8605 || coding.head_ascii < coding.src_bytes
8606 || detect_info.found)
8608 if (coding.head_ascii == coding.src_bytes)
8609 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8610 for (i = 0; i < coding_category_raw_text; i++)
8612 category = coding_priorities[i];
8613 this = coding_categories + category;
8614 if (detect_info.found & (1 << category))
8615 break;
8617 else
8619 if (null_byte_found)
8621 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8622 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8624 else if (prefer_utf_8
8625 && detect_coding_utf_8 (&coding, &detect_info))
8627 detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8628 detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8630 for (i = 0; i < coding_category_raw_text; i++)
8632 category = coding_priorities[i];
8633 this = coding_categories + category;
8635 if (this->id < 0)
8637 /* No coding system of this category is defined. */
8638 detect_info.rejected |= (1 << category);
8640 else if (category >= coding_category_raw_text)
8641 continue;
8642 else if (detect_info.checked & (1 << category))
8644 if (highest
8645 && (detect_info.found & (1 << category)))
8646 break;
8648 else if ((*(this->detector)) (&coding, &detect_info)
8649 && highest
8650 && (detect_info.found & (1 << category)))
8652 if (category == coding_category_utf_16_auto)
8654 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8655 category = coding_category_utf_16_le;
8656 else
8657 category = coding_category_utf_16_be;
8659 break;
8665 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8666 || null_byte_found)
8668 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8669 id = CODING_SYSTEM_ID (Qno_conversion);
8670 val = Fcons (make_number (id), Qnil);
8672 else if (! detect_info.rejected && ! detect_info.found)
8674 detect_info.found = CATEGORY_MASK_ANY;
8675 id = coding_categories[coding_category_undecided].id;
8676 val = Fcons (make_number (id), Qnil);
8678 else if (highest)
8680 if (detect_info.found)
8682 detect_info.found = 1 << category;
8683 val = Fcons (make_number (this->id), Qnil);
8685 else
8686 for (i = 0; i < coding_category_raw_text; i++)
8687 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8689 detect_info.found = 1 << coding_priorities[i];
8690 id = coding_categories[coding_priorities[i]].id;
8691 val = Fcons (make_number (id), Qnil);
8692 break;
8695 else
8697 int mask = detect_info.rejected | detect_info.found;
8698 int found = 0;
8700 for (i = coding_category_raw_text - 1; i >= 0; i--)
8702 category = coding_priorities[i];
8703 if (! (mask & (1 << category)))
8705 found |= 1 << category;
8706 id = coding_categories[category].id;
8707 if (id >= 0)
8708 val = Fcons (make_number (id), val);
8711 for (i = coding_category_raw_text - 1; i >= 0; i--)
8713 category = coding_priorities[i];
8714 if (detect_info.found & (1 << category))
8716 id = coding_categories[category].id;
8717 val = Fcons (make_number (id), val);
8720 detect_info.found |= found;
8723 else if (base_category == coding_category_utf_8_auto)
8725 if (detect_coding_utf_8 (&coding, &detect_info))
8727 struct coding_system *this;
8729 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8730 this = coding_categories + coding_category_utf_8_sig;
8731 else
8732 this = coding_categories + coding_category_utf_8_nosig;
8733 val = Fcons (make_number (this->id), Qnil);
8736 else if (base_category == coding_category_utf_16_auto)
8738 if (detect_coding_utf_16 (&coding, &detect_info))
8740 struct coding_system *this;
8742 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8743 this = coding_categories + coding_category_utf_16_le;
8744 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8745 this = coding_categories + coding_category_utf_16_be;
8746 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8747 this = coding_categories + coding_category_utf_16_be_nosig;
8748 else
8749 this = coding_categories + coding_category_utf_16_le_nosig;
8750 val = Fcons (make_number (this->id), Qnil);
8753 else
8755 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8756 val = Fcons (make_number (coding.id), Qnil);
8759 /* Then, detect eol-format if necessary. */
8761 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8762 Lisp_Object tail;
8764 if (VECTORP (eol_type))
8766 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8768 if (null_byte_found)
8769 normal_eol = EOL_SEEN_LF;
8770 else
8771 normal_eol = detect_eol (coding.source, src_bytes,
8772 coding_category_raw_text);
8774 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8775 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8776 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8777 coding_category_utf_16_be);
8778 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8779 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8780 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8781 coding_category_utf_16_le);
8783 else
8785 if (EQ (eol_type, Qunix))
8786 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8787 else if (EQ (eol_type, Qdos))
8788 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8789 else
8790 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8793 for (tail = val; CONSP (tail); tail = XCDR (tail))
8795 enum coding_category category;
8796 int this_eol;
8798 id = XINT (XCAR (tail));
8799 attrs = CODING_ID_ATTRS (id);
8800 category = XINT (CODING_ATTR_CATEGORY (attrs));
8801 eol_type = CODING_ID_EOL_TYPE (id);
8802 if (VECTORP (eol_type))
8804 if (category == coding_category_utf_16_be
8805 || category == coding_category_utf_16_be_nosig)
8806 this_eol = utf_16_be_eol;
8807 else if (category == coding_category_utf_16_le
8808 || category == coding_category_utf_16_le_nosig)
8809 this_eol = utf_16_le_eol;
8810 else
8811 this_eol = normal_eol;
8813 if (this_eol == EOL_SEEN_LF)
8814 XSETCAR (tail, AREF (eol_type, 0));
8815 else if (this_eol == EOL_SEEN_CRLF)
8816 XSETCAR (tail, AREF (eol_type, 1));
8817 else if (this_eol == EOL_SEEN_CR)
8818 XSETCAR (tail, AREF (eol_type, 2));
8819 else
8820 XSETCAR (tail, CODING_ID_NAME (id));
8822 else
8823 XSETCAR (tail, CODING_ID_NAME (id));
8827 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8831 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8832 2, 3, 0,
8833 doc: /* Detect coding system of the text in the region between START and END.
8834 Return a list of possible coding systems ordered by priority.
8835 The coding systems to try and their priorities follows what
8836 the function `coding-system-priority-list' (which see) returns.
8838 If only ASCII characters are found (except for such ISO-2022 control
8839 characters as ESC), it returns a list of single element `undecided'
8840 or its subsidiary coding system according to a detected end-of-line
8841 format.
8843 If optional argument HIGHEST is non-nil, return the coding system of
8844 highest priority. */)
8845 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8847 ptrdiff_t from, to;
8848 ptrdiff_t from_byte, to_byte;
8850 validate_region (&start, &end);
8851 from = XINT (start), to = XINT (end);
8852 from_byte = CHAR_TO_BYTE (from);
8853 to_byte = CHAR_TO_BYTE (to);
8855 if (from < GPT && to >= GPT)
8856 move_gap_both (to, to_byte);
8858 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8859 to - from, to_byte - from_byte,
8860 !NILP (highest),
8861 !NILP (BVAR (current_buffer
8862 , enable_multibyte_characters)),
8863 Qnil);
8866 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8867 1, 2, 0,
8868 doc: /* Detect coding system of the text in STRING.
8869 Return a list of possible coding systems ordered by priority.
8870 The coding systems to try and their priorities follows what
8871 the function `coding-system-priority-list' (which see) returns.
8873 If only ASCII characters are found (except for such ISO-2022 control
8874 characters as ESC), it returns a list of single element `undecided'
8875 or its subsidiary coding system according to a detected end-of-line
8876 format.
8878 If optional argument HIGHEST is non-nil, return the coding system of
8879 highest priority. */)
8880 (Lisp_Object string, Lisp_Object highest)
8882 CHECK_STRING (string);
8884 return detect_coding_system (SDATA (string),
8885 SCHARS (string), SBYTES (string),
8886 !NILP (highest), STRING_MULTIBYTE (string),
8887 Qnil);
8891 static bool
8892 char_encodable_p (int c, Lisp_Object attrs)
8894 Lisp_Object tail;
8895 struct charset *charset;
8896 Lisp_Object translation_table;
8898 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8899 if (! NILP (translation_table))
8900 c = translate_char (translation_table, c);
8901 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8902 CONSP (tail); tail = XCDR (tail))
8904 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8905 if (CHAR_CHARSET_P (c, charset))
8906 break;
8908 return (! NILP (tail));
8912 /* Return a list of coding systems that safely encode the text between
8913 START and END. If EXCLUDE is non-nil, it is a list of coding
8914 systems not to check. The returned list doesn't contain any such
8915 coding systems. In any case, if the text contains only ASCII or is
8916 unibyte, return t. */
8918 DEFUN ("find-coding-systems-region-internal",
8919 Ffind_coding_systems_region_internal,
8920 Sfind_coding_systems_region_internal, 2, 3, 0,
8921 doc: /* Internal use only. */)
8922 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8924 Lisp_Object coding_attrs_list, safe_codings;
8925 ptrdiff_t start_byte, end_byte;
8926 const unsigned char *p, *pbeg, *pend;
8927 int c;
8928 Lisp_Object tail, elt, work_table;
8930 if (STRINGP (start))
8932 if (!STRING_MULTIBYTE (start)
8933 || SCHARS (start) == SBYTES (start))
8934 return Qt;
8935 start_byte = 0;
8936 end_byte = SBYTES (start);
8938 else
8940 CHECK_NUMBER_COERCE_MARKER (start);
8941 CHECK_NUMBER_COERCE_MARKER (end);
8942 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8943 args_out_of_range (start, end);
8944 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8945 return Qt;
8946 start_byte = CHAR_TO_BYTE (XINT (start));
8947 end_byte = CHAR_TO_BYTE (XINT (end));
8948 if (XINT (end) - XINT (start) == end_byte - start_byte)
8949 return Qt;
8951 if (XINT (start) < GPT && XINT (end) > GPT)
8953 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8954 move_gap_both (XINT (start), start_byte);
8955 else
8956 move_gap_both (XINT (end), end_byte);
8960 coding_attrs_list = Qnil;
8961 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8962 if (NILP (exclude)
8963 || NILP (Fmemq (XCAR (tail), exclude)))
8965 Lisp_Object attrs;
8967 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8968 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
8970 ASET (attrs, coding_attr_trans_tbl,
8971 get_translation_table (attrs, 1, NULL));
8972 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8976 if (STRINGP (start))
8977 p = pbeg = SDATA (start);
8978 else
8979 p = pbeg = BYTE_POS_ADDR (start_byte);
8980 pend = p + (end_byte - start_byte);
8982 while (p < pend && ASCII_BYTE_P (*p)) p++;
8983 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8985 work_table = Fmake_char_table (Qnil, Qnil);
8986 while (p < pend)
8988 if (ASCII_BYTE_P (*p))
8989 p++;
8990 else
8992 c = STRING_CHAR_ADVANCE (p);
8993 if (!NILP (char_table_ref (work_table, c)))
8994 /* This character was already checked. Ignore it. */
8995 continue;
8997 charset_map_loaded = 0;
8998 for (tail = coding_attrs_list; CONSP (tail);)
9000 elt = XCAR (tail);
9001 if (NILP (elt))
9002 tail = XCDR (tail);
9003 else if (char_encodable_p (c, elt))
9004 tail = XCDR (tail);
9005 else if (CONSP (XCDR (tail)))
9007 XSETCAR (tail, XCAR (XCDR (tail)));
9008 XSETCDR (tail, XCDR (XCDR (tail)));
9010 else
9012 XSETCAR (tail, Qnil);
9013 tail = XCDR (tail);
9016 if (charset_map_loaded)
9018 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9020 if (STRINGP (start))
9021 pbeg = SDATA (start);
9022 else
9023 pbeg = BYTE_POS_ADDR (start_byte);
9024 p = pbeg + p_offset;
9025 pend = pbeg + pend_offset;
9027 char_table_set (work_table, c, Qt);
9031 safe_codings = list2 (Qraw_text, Qno_conversion);
9032 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9033 if (! NILP (XCAR (tail)))
9034 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9036 return safe_codings;
9040 DEFUN ("unencodable-char-position", Funencodable_char_position,
9041 Sunencodable_char_position, 3, 5, 0,
9042 doc: /*
9043 Return position of first un-encodable character in a region.
9044 START and END specify the region and CODING-SYSTEM specifies the
9045 encoding to check. Return nil if CODING-SYSTEM does encode the region.
9047 If optional 4th argument COUNT is non-nil, it specifies at most how
9048 many un-encodable characters to search. In this case, the value is a
9049 list of positions.
9051 If optional 5th argument STRING is non-nil, it is a string to search
9052 for un-encodable characters. In that case, START and END are indexes
9053 to the string. */)
9054 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9056 EMACS_INT n;
9057 struct coding_system coding;
9058 Lisp_Object attrs, charset_list, translation_table;
9059 Lisp_Object positions;
9060 ptrdiff_t from, to;
9061 const unsigned char *p, *stop, *pend;
9062 bool ascii_compatible;
9064 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9065 attrs = CODING_ID_ATTRS (coding.id);
9066 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9067 return Qnil;
9068 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9069 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9070 translation_table = get_translation_table (attrs, 1, NULL);
9072 if (NILP (string))
9074 validate_region (&start, &end);
9075 from = XINT (start);
9076 to = XINT (end);
9077 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9078 || (ascii_compatible
9079 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9080 return Qnil;
9081 p = CHAR_POS_ADDR (from);
9082 pend = CHAR_POS_ADDR (to);
9083 if (from < GPT && to >= GPT)
9084 stop = GPT_ADDR;
9085 else
9086 stop = pend;
9088 else
9090 CHECK_STRING (string);
9091 CHECK_NATNUM (start);
9092 CHECK_NATNUM (end);
9093 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9094 args_out_of_range_3 (string, start, end);
9095 from = XINT (start);
9096 to = XINT (end);
9097 if (! STRING_MULTIBYTE (string))
9098 return Qnil;
9099 p = SDATA (string) + string_char_to_byte (string, from);
9100 stop = pend = SDATA (string) + string_char_to_byte (string, to);
9101 if (ascii_compatible && (to - from) == (pend - p))
9102 return Qnil;
9105 if (NILP (count))
9106 n = 1;
9107 else
9109 CHECK_NATNUM (count);
9110 n = XINT (count);
9113 positions = Qnil;
9114 charset_map_loaded = 0;
9115 while (1)
9117 int c;
9119 if (ascii_compatible)
9120 while (p < stop && ASCII_BYTE_P (*p))
9121 p++, from++;
9122 if (p >= stop)
9124 if (p >= pend)
9125 break;
9126 stop = pend;
9127 p = GAP_END_ADDR;
9130 c = STRING_CHAR_ADVANCE (p);
9131 if (! (ASCII_CHAR_P (c) && ascii_compatible)
9132 && ! char_charset (translate_char (translation_table, c),
9133 charset_list, NULL))
9135 positions = Fcons (make_number (from), positions);
9136 n--;
9137 if (n == 0)
9138 break;
9141 from++;
9142 if (charset_map_loaded && NILP (string))
9144 p = CHAR_POS_ADDR (from);
9145 pend = CHAR_POS_ADDR (to);
9146 if (from < GPT && to >= GPT)
9147 stop = GPT_ADDR;
9148 else
9149 stop = pend;
9150 charset_map_loaded = 0;
9154 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9158 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9159 Scheck_coding_systems_region, 3, 3, 0,
9160 doc: /* Check if the region is encodable by coding systems.
9162 START and END are buffer positions specifying the region.
9163 CODING-SYSTEM-LIST is a list of coding systems to check.
9165 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9166 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9167 whole region, POS0, POS1, ... are buffer positions where non-encodable
9168 characters are found.
9170 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9171 value is nil.
9173 START may be a string. In that case, check if the string is
9174 encodable, and the value contains indices to the string instead of
9175 buffer positions. END is ignored.
9177 If the current buffer (or START if it is a string) is unibyte, the value
9178 is nil. */)
9179 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9181 Lisp_Object list;
9182 ptrdiff_t start_byte, end_byte;
9183 ptrdiff_t pos;
9184 const unsigned char *p, *pbeg, *pend;
9185 int c;
9186 Lisp_Object tail, elt, attrs;
9188 if (STRINGP (start))
9190 if (!STRING_MULTIBYTE (start)
9191 || SCHARS (start) == SBYTES (start))
9192 return Qnil;
9193 start_byte = 0;
9194 end_byte = SBYTES (start);
9195 pos = 0;
9197 else
9199 CHECK_NUMBER_COERCE_MARKER (start);
9200 CHECK_NUMBER_COERCE_MARKER (end);
9201 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9202 args_out_of_range (start, end);
9203 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9204 return Qnil;
9205 start_byte = CHAR_TO_BYTE (XINT (start));
9206 end_byte = CHAR_TO_BYTE (XINT (end));
9207 if (XINT (end) - XINT (start) == end_byte - start_byte)
9208 return Qnil;
9210 if (XINT (start) < GPT && XINT (end) > GPT)
9212 if ((GPT - XINT (start)) < (XINT (end) - GPT))
9213 move_gap_both (XINT (start), start_byte);
9214 else
9215 move_gap_both (XINT (end), end_byte);
9217 pos = XINT (start);
9220 list = Qnil;
9221 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9223 elt = XCAR (tail);
9224 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9225 ASET (attrs, coding_attr_trans_tbl,
9226 get_translation_table (attrs, 1, NULL));
9227 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9230 if (STRINGP (start))
9231 p = pbeg = SDATA (start);
9232 else
9233 p = pbeg = BYTE_POS_ADDR (start_byte);
9234 pend = p + (end_byte - start_byte);
9236 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9237 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9239 while (p < pend)
9241 if (ASCII_BYTE_P (*p))
9242 p++;
9243 else
9245 c = STRING_CHAR_ADVANCE (p);
9247 charset_map_loaded = 0;
9248 for (tail = list; CONSP (tail); tail = XCDR (tail))
9250 elt = XCDR (XCAR (tail));
9251 if (! char_encodable_p (c, XCAR (elt)))
9252 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9254 if (charset_map_loaded)
9256 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9258 if (STRINGP (start))
9259 pbeg = SDATA (start);
9260 else
9261 pbeg = BYTE_POS_ADDR (start_byte);
9262 p = pbeg + p_offset;
9263 pend = pbeg + pend_offset;
9266 pos++;
9269 tail = list;
9270 list = Qnil;
9271 for (; CONSP (tail); tail = XCDR (tail))
9273 elt = XCAR (tail);
9274 if (CONSP (XCDR (XCDR (elt))))
9275 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9276 list);
9279 return list;
9283 static Lisp_Object
9284 code_convert_region (Lisp_Object start, Lisp_Object end,
9285 Lisp_Object coding_system, Lisp_Object dst_object,
9286 bool encodep, bool norecord)
9288 struct coding_system coding;
9289 ptrdiff_t from, from_byte, to, to_byte;
9290 Lisp_Object src_object;
9292 if (NILP (coding_system))
9293 coding_system = Qno_conversion;
9294 else
9295 CHECK_CODING_SYSTEM (coding_system);
9296 src_object = Fcurrent_buffer ();
9297 if (NILP (dst_object))
9298 dst_object = src_object;
9299 else if (! EQ (dst_object, Qt))
9300 CHECK_BUFFER (dst_object);
9302 validate_region (&start, &end);
9303 from = XFASTINT (start);
9304 from_byte = CHAR_TO_BYTE (from);
9305 to = XFASTINT (end);
9306 to_byte = CHAR_TO_BYTE (to);
9308 setup_coding_system (coding_system, &coding);
9309 coding.mode |= CODING_MODE_LAST_BLOCK;
9311 if (encodep)
9312 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9313 dst_object);
9314 else
9315 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9316 dst_object);
9317 if (! norecord)
9318 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9320 return (BUFFERP (dst_object)
9321 ? make_number (coding.produced_char)
9322 : coding.dst_object);
9326 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9327 3, 4, "r\nzCoding system: ",
9328 doc: /* Decode the current region from the specified coding system.
9329 When called from a program, takes four arguments:
9330 START, END, CODING-SYSTEM, and DESTINATION.
9331 START and END are buffer positions.
9333 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9334 If nil, the region between START and END is replaced by the decoded text.
9335 If buffer, the decoded text is inserted in that buffer after point (point
9336 does not move).
9337 In those cases, the length of the decoded text is returned.
9338 If DESTINATION is t, the decoded text is returned.
9340 This function sets `last-coding-system-used' to the precise coding system
9341 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9342 not fully specified.) */)
9343 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9345 return code_convert_region (start, end, coding_system, destination, 0, 0);
9348 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9349 3, 4, "r\nzCoding system: ",
9350 doc: /* Encode the current region by specified coding system.
9351 When called from a program, takes four arguments:
9352 START, END, CODING-SYSTEM and DESTINATION.
9353 START and END are buffer positions.
9355 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9356 If nil, the region between START and END is replace by the encoded text.
9357 If buffer, the encoded text is inserted in that buffer after point (point
9358 does not move).
9359 In those cases, the length of the encoded text is returned.
9360 If DESTINATION is t, the encoded text is returned.
9362 This function sets `last-coding-system-used' to the precise coding system
9363 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9364 not fully specified.) */)
9365 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9367 return code_convert_region (start, end, coding_system, destination, 1, 0);
9370 Lisp_Object
9371 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9372 Lisp_Object dst_object, bool encodep, bool nocopy,
9373 bool norecord)
9375 struct coding_system coding;
9376 ptrdiff_t chars, bytes;
9378 CHECK_STRING (string);
9379 if (NILP (coding_system))
9381 if (! norecord)
9382 Vlast_coding_system_used = Qno_conversion;
9383 if (NILP (dst_object))
9384 return (nocopy ? Fcopy_sequence (string) : string);
9387 if (NILP (coding_system))
9388 coding_system = Qno_conversion;
9389 else
9390 CHECK_CODING_SYSTEM (coding_system);
9391 if (NILP (dst_object))
9392 dst_object = Qt;
9393 else if (! EQ (dst_object, Qt))
9394 CHECK_BUFFER (dst_object);
9396 setup_coding_system (coding_system, &coding);
9397 coding.mode |= CODING_MODE_LAST_BLOCK;
9398 chars = SCHARS (string);
9399 bytes = SBYTES (string);
9400 if (encodep)
9401 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9402 else
9403 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9404 if (! norecord)
9405 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9407 return (BUFFERP (dst_object)
9408 ? make_number (coding.produced_char)
9409 : coding.dst_object);
9413 /* Encode or decode STRING according to CODING_SYSTEM.
9414 Do not set Vlast_coding_system_used.
9416 This function is called only from macros DECODE_FILE and
9417 ENCODE_FILE, thus we ignore character composition. */
9419 Lisp_Object
9420 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9421 bool encodep)
9423 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9427 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9428 2, 4, 0,
9429 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9431 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9432 if the decoding operation is trivial.
9434 Optional fourth arg BUFFER non-nil means that the decoded text is
9435 inserted in that buffer after point (point does not move). In this
9436 case, the return value is the length of the decoded text.
9438 This function sets `last-coding-system-used' to the precise coding system
9439 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9440 not fully specified.) */)
9441 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9443 return code_convert_string (string, coding_system, buffer,
9444 0, ! NILP (nocopy), 0);
9447 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9448 2, 4, 0,
9449 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9451 Optional third arg NOCOPY non-nil means it is OK to return STRING
9452 itself if the encoding operation is trivial.
9454 Optional fourth arg BUFFER non-nil means that the encoded text is
9455 inserted in that buffer after point (point does not move). In this
9456 case, the return value is the length of the encoded text.
9458 This function sets `last-coding-system-used' to the precise coding system
9459 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9460 not fully specified.) */)
9461 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9463 return code_convert_string (string, coding_system, buffer,
9464 1, ! NILP (nocopy), 0);
9468 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9469 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9470 Return the corresponding character. */)
9471 (Lisp_Object code)
9473 Lisp_Object spec, attrs, val;
9474 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9475 EMACS_INT ch;
9476 int c;
9478 CHECK_NATNUM (code);
9479 ch = XFASTINT (code);
9480 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9481 attrs = AREF (spec, 0);
9483 if (ASCII_BYTE_P (ch)
9484 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9485 return code;
9487 val = CODING_ATTR_CHARSET_LIST (attrs);
9488 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9489 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9490 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9492 if (ch <= 0x7F)
9494 c = ch;
9495 charset = charset_roman;
9497 else if (ch >= 0xA0 && ch < 0xDF)
9499 c = ch - 0x80;
9500 charset = charset_kana;
9502 else
9504 EMACS_INT c1 = ch >> 8;
9505 int c2 = ch & 0xFF;
9507 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9508 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9509 error ("Invalid code: %"pI"d", ch);
9510 c = ch;
9511 SJIS_TO_JIS (c);
9512 charset = charset_kanji;
9514 c = DECODE_CHAR (charset, c);
9515 if (c < 0)
9516 error ("Invalid code: %"pI"d", ch);
9517 return make_number (c);
9521 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9522 doc: /* Encode a Japanese character CH to shift_jis encoding.
9523 Return the corresponding code in SJIS. */)
9524 (Lisp_Object ch)
9526 Lisp_Object spec, attrs, charset_list;
9527 int c;
9528 struct charset *charset;
9529 unsigned code;
9531 CHECK_CHARACTER (ch);
9532 c = XFASTINT (ch);
9533 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9534 attrs = AREF (spec, 0);
9536 if (ASCII_CHAR_P (c)
9537 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9538 return ch;
9540 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9541 charset = char_charset (c, charset_list, &code);
9542 if (code == CHARSET_INVALID_CODE (charset))
9543 error ("Can't encode by shift_jis encoding: %c", c);
9544 JIS_TO_SJIS (code);
9546 return make_number (code);
9549 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9550 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9551 Return the corresponding character. */)
9552 (Lisp_Object code)
9554 Lisp_Object spec, attrs, val;
9555 struct charset *charset_roman, *charset_big5, *charset;
9556 EMACS_INT ch;
9557 int c;
9559 CHECK_NATNUM (code);
9560 ch = XFASTINT (code);
9561 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9562 attrs = AREF (spec, 0);
9564 if (ASCII_BYTE_P (ch)
9565 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9566 return code;
9568 val = CODING_ATTR_CHARSET_LIST (attrs);
9569 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9570 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9572 if (ch <= 0x7F)
9574 c = ch;
9575 charset = charset_roman;
9577 else
9579 EMACS_INT b1 = ch >> 8;
9580 int b2 = ch & 0x7F;
9581 if (b1 < 0xA1 || b1 > 0xFE
9582 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9583 error ("Invalid code: %"pI"d", ch);
9584 c = ch;
9585 charset = charset_big5;
9587 c = DECODE_CHAR (charset, c);
9588 if (c < 0)
9589 error ("Invalid code: %"pI"d", ch);
9590 return make_number (c);
9593 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9594 doc: /* Encode the Big5 character CH to BIG5 coding system.
9595 Return the corresponding character code in Big5. */)
9596 (Lisp_Object ch)
9598 Lisp_Object spec, attrs, charset_list;
9599 struct charset *charset;
9600 int c;
9601 unsigned code;
9603 CHECK_CHARACTER (ch);
9604 c = XFASTINT (ch);
9605 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9606 attrs = AREF (spec, 0);
9607 if (ASCII_CHAR_P (c)
9608 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9609 return ch;
9611 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9612 charset = char_charset (c, charset_list, &code);
9613 if (code == CHARSET_INVALID_CODE (charset))
9614 error ("Can't encode by Big5 encoding: %c", c);
9616 return make_number (code);
9620 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9621 Sset_terminal_coding_system_internal, 1, 2, 0,
9622 doc: /* Internal use only. */)
9623 (Lisp_Object coding_system, Lisp_Object terminal)
9625 struct terminal *term = get_terminal (terminal, 1);
9626 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9627 CHECK_SYMBOL (coding_system);
9628 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9629 /* We had better not send unsafe characters to terminal. */
9630 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9631 /* Character composition should be disabled. */
9632 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9633 terminal_coding->src_multibyte = 1;
9634 terminal_coding->dst_multibyte = 0;
9635 tset_charset_list
9636 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9637 ? coding_charset_list (terminal_coding)
9638 : Fcons (make_number (charset_ascii), Qnil)));
9639 return Qnil;
9642 DEFUN ("set-safe-terminal-coding-system-internal",
9643 Fset_safe_terminal_coding_system_internal,
9644 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9645 doc: /* Internal use only. */)
9646 (Lisp_Object coding_system)
9648 CHECK_SYMBOL (coding_system);
9649 setup_coding_system (Fcheck_coding_system (coding_system),
9650 &safe_terminal_coding);
9651 /* Character composition should be disabled. */
9652 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9653 safe_terminal_coding.src_multibyte = 1;
9654 safe_terminal_coding.dst_multibyte = 0;
9655 return Qnil;
9658 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9659 Sterminal_coding_system, 0, 1, 0,
9660 doc: /* Return coding system specified for terminal output on the given terminal.
9661 TERMINAL may be a terminal object, a frame, or nil for the selected
9662 frame's terminal device. */)
9663 (Lisp_Object terminal)
9665 struct coding_system *terminal_coding
9666 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9667 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9669 /* For backward compatibility, return nil if it is `undecided'. */
9670 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9673 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9674 Sset_keyboard_coding_system_internal, 1, 2, 0,
9675 doc: /* Internal use only. */)
9676 (Lisp_Object coding_system, Lisp_Object terminal)
9678 struct terminal *t = get_terminal (terminal, 1);
9679 CHECK_SYMBOL (coding_system);
9680 if (NILP (coding_system))
9681 coding_system = Qno_conversion;
9682 else
9683 Fcheck_coding_system (coding_system);
9684 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9685 /* Character composition should be disabled. */
9686 TERMINAL_KEYBOARD_CODING (t)->common_flags
9687 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9688 return Qnil;
9691 DEFUN ("keyboard-coding-system",
9692 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9693 doc: /* Return coding system specified for decoding keyboard input. */)
9694 (Lisp_Object terminal)
9696 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9697 (get_terminal (terminal, 1))->id);
9701 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9702 Sfind_operation_coding_system, 1, MANY, 0,
9703 doc: /* Choose a coding system for an operation based on the target name.
9704 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9705 DECODING-SYSTEM is the coding system to use for decoding
9706 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9707 for encoding (in case OPERATION does encoding).
9709 The first argument OPERATION specifies an I/O primitive:
9710 For file I/O, `insert-file-contents' or `write-region'.
9711 For process I/O, `call-process', `call-process-region', or `start-process'.
9712 For network I/O, `open-network-stream'.
9714 The remaining arguments should be the same arguments that were passed
9715 to the primitive. Depending on which primitive, one of those arguments
9716 is selected as the TARGET. For example, if OPERATION does file I/O,
9717 whichever argument specifies the file name is TARGET.
9719 TARGET has a meaning which depends on OPERATION:
9720 For file I/O, TARGET is a file name (except for the special case below).
9721 For process I/O, TARGET is a process name.
9722 For network I/O, TARGET is a service name or a port number.
9724 This function looks up what is specified for TARGET in
9725 `file-coding-system-alist', `process-coding-system-alist',
9726 or `network-coding-system-alist' depending on OPERATION.
9727 They may specify a coding system, a cons of coding systems,
9728 or a function symbol to call.
9729 In the last case, we call the function with one argument,
9730 which is a list of all the arguments given to this function.
9731 If the function can't decide a coding system, it can return
9732 `undecided' so that the normal code-detection is performed.
9734 If OPERATION is `insert-file-contents', the argument corresponding to
9735 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9736 file name to look up, and BUFFER is a buffer that contains the file's
9737 contents (not yet decoded). If `file-coding-system-alist' specifies a
9738 function to call for FILENAME, that function should examine the
9739 contents of BUFFER instead of reading the file.
9741 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9742 (ptrdiff_t nargs, Lisp_Object *args)
9744 Lisp_Object operation, target_idx, target, val;
9745 register Lisp_Object chain;
9747 if (nargs < 2)
9748 error ("Too few arguments");
9749 operation = args[0];
9750 if (!SYMBOLP (operation)
9751 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9752 error ("Invalid first argument");
9753 if (nargs <= 1 + XFASTINT (target_idx))
9754 error ("Too few arguments for operation `%s'",
9755 SDATA (SYMBOL_NAME (operation)));
9756 target = args[XFASTINT (target_idx) + 1];
9757 if (!(STRINGP (target)
9758 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9759 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9760 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9761 error ("Invalid argument %"pI"d of operation `%s'",
9762 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9763 if (CONSP (target))
9764 target = XCAR (target);
9766 chain = ((EQ (operation, Qinsert_file_contents)
9767 || EQ (operation, Qwrite_region))
9768 ? Vfile_coding_system_alist
9769 : (EQ (operation, Qopen_network_stream)
9770 ? Vnetwork_coding_system_alist
9771 : Vprocess_coding_system_alist));
9772 if (NILP (chain))
9773 return Qnil;
9775 for (; CONSP (chain); chain = XCDR (chain))
9777 Lisp_Object elt;
9779 elt = XCAR (chain);
9780 if (CONSP (elt)
9781 && ((STRINGP (target)
9782 && STRINGP (XCAR (elt))
9783 && fast_string_match (XCAR (elt), target) >= 0)
9784 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9786 val = XCDR (elt);
9787 /* Here, if VAL is both a valid coding system and a valid
9788 function symbol, we return VAL as a coding system. */
9789 if (CONSP (val))
9790 return val;
9791 if (! SYMBOLP (val))
9792 return Qnil;
9793 if (! NILP (Fcoding_system_p (val)))
9794 return Fcons (val, val);
9795 if (! NILP (Ffboundp (val)))
9797 /* We use call1 rather than safe_call1
9798 so as to get bug reports about functions called here
9799 which don't handle the current interface. */
9800 val = call1 (val, Flist (nargs, args));
9801 if (CONSP (val))
9802 return val;
9803 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9804 return Fcons (val, val);
9806 return Qnil;
9809 return Qnil;
9812 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9813 Sset_coding_system_priority, 0, MANY, 0,
9814 doc: /* Assign higher priority to the coding systems given as arguments.
9815 If multiple coding systems belong to the same category,
9816 all but the first one are ignored.
9818 usage: (set-coding-system-priority &rest coding-systems) */)
9819 (ptrdiff_t nargs, Lisp_Object *args)
9821 ptrdiff_t i, j;
9822 bool changed[coding_category_max];
9823 enum coding_category priorities[coding_category_max];
9825 memset (changed, 0, sizeof changed);
9827 for (i = j = 0; i < nargs; i++)
9829 enum coding_category category;
9830 Lisp_Object spec, attrs;
9832 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9833 attrs = AREF (spec, 0);
9834 category = XINT (CODING_ATTR_CATEGORY (attrs));
9835 if (changed[category])
9836 /* Ignore this coding system because a coding system of the
9837 same category already had a higher priority. */
9838 continue;
9839 changed[category] = 1;
9840 priorities[j++] = category;
9841 if (coding_categories[category].id >= 0
9842 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9843 setup_coding_system (args[i], &coding_categories[category]);
9844 Fset (AREF (Vcoding_category_table, category), args[i]);
9847 /* Now we have decided top J priorities. Reflect the order of the
9848 original priorities to the remaining priorities. */
9850 for (i = j, j = 0; i < coding_category_max; i++, j++)
9852 while (j < coding_category_max
9853 && changed[coding_priorities[j]])
9854 j++;
9855 if (j == coding_category_max)
9856 emacs_abort ();
9857 priorities[i] = coding_priorities[j];
9860 memcpy (coding_priorities, priorities, sizeof priorities);
9862 /* Update `coding-category-list'. */
9863 Vcoding_category_list = Qnil;
9864 for (i = coding_category_max; i-- > 0; )
9865 Vcoding_category_list
9866 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9867 Vcoding_category_list);
9869 return Qnil;
9872 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9873 Scoding_system_priority_list, 0, 1, 0,
9874 doc: /* Return a list of coding systems ordered by their priorities.
9875 The list contains a subset of coding systems; i.e. coding systems
9876 assigned to each coding category (see `coding-category-list').
9878 HIGHESTP non-nil means just return the highest priority one. */)
9879 (Lisp_Object highestp)
9881 int i;
9882 Lisp_Object val;
9884 for (i = 0, val = Qnil; i < coding_category_max; i++)
9886 enum coding_category category = coding_priorities[i];
9887 int id = coding_categories[category].id;
9888 Lisp_Object attrs;
9890 if (id < 0)
9891 continue;
9892 attrs = CODING_ID_ATTRS (id);
9893 if (! NILP (highestp))
9894 return CODING_ATTR_BASE_NAME (attrs);
9895 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9897 return Fnreverse (val);
9900 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9902 static Lisp_Object
9903 make_subsidiaries (Lisp_Object base)
9905 Lisp_Object subsidiaries;
9906 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9907 char *buf = alloca (base_name_len + 6);
9908 int i;
9910 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9911 subsidiaries = make_uninit_vector (3);
9912 for (i = 0; i < 3; i++)
9914 strcpy (buf + base_name_len, suffixes[i]);
9915 ASET (subsidiaries, i, intern (buf));
9917 return subsidiaries;
9921 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9922 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9923 doc: /* For internal use only.
9924 usage: (define-coding-system-internal ...) */)
9925 (ptrdiff_t nargs, Lisp_Object *args)
9927 Lisp_Object name;
9928 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9929 Lisp_Object attrs; /* Vector of attributes. */
9930 Lisp_Object eol_type;
9931 Lisp_Object aliases;
9932 Lisp_Object coding_type, charset_list, safe_charsets;
9933 enum coding_category category;
9934 Lisp_Object tail, val;
9935 int max_charset_id = 0;
9936 int i;
9938 if (nargs < coding_arg_max)
9939 goto short_args;
9941 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9943 name = args[coding_arg_name];
9944 CHECK_SYMBOL (name);
9945 ASET (attrs, coding_attr_base_name, name);
9947 val = args[coding_arg_mnemonic];
9948 if (! STRINGP (val))
9949 CHECK_CHARACTER (val);
9950 ASET (attrs, coding_attr_mnemonic, val);
9952 coding_type = args[coding_arg_coding_type];
9953 CHECK_SYMBOL (coding_type);
9954 ASET (attrs, coding_attr_type, coding_type);
9956 charset_list = args[coding_arg_charset_list];
9957 if (SYMBOLP (charset_list))
9959 if (EQ (charset_list, Qiso_2022))
9961 if (! EQ (coding_type, Qiso_2022))
9962 error ("Invalid charset-list");
9963 charset_list = Viso_2022_charset_list;
9965 else if (EQ (charset_list, Qemacs_mule))
9967 if (! EQ (coding_type, Qemacs_mule))
9968 error ("Invalid charset-list");
9969 charset_list = Vemacs_mule_charset_list;
9971 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9973 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9974 error ("Invalid charset-list");
9975 if (max_charset_id < XFASTINT (XCAR (tail)))
9976 max_charset_id = XFASTINT (XCAR (tail));
9979 else
9981 charset_list = Fcopy_sequence (charset_list);
9982 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9984 struct charset *charset;
9986 val = XCAR (tail);
9987 CHECK_CHARSET_GET_CHARSET (val, charset);
9988 if (EQ (coding_type, Qiso_2022)
9989 ? CHARSET_ISO_FINAL (charset) < 0
9990 : EQ (coding_type, Qemacs_mule)
9991 ? CHARSET_EMACS_MULE_ID (charset) < 0
9992 : 0)
9993 error ("Can't handle charset `%s'",
9994 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9996 XSETCAR (tail, make_number (charset->id));
9997 if (max_charset_id < charset->id)
9998 max_charset_id = charset->id;
10001 ASET (attrs, coding_attr_charset_list, charset_list);
10003 safe_charsets = make_uninit_string (max_charset_id + 1);
10004 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10005 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10006 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10007 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10009 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10011 val = args[coding_arg_decode_translation_table];
10012 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10013 CHECK_SYMBOL (val);
10014 ASET (attrs, coding_attr_decode_tbl, val);
10016 val = args[coding_arg_encode_translation_table];
10017 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10018 CHECK_SYMBOL (val);
10019 ASET (attrs, coding_attr_encode_tbl, val);
10021 val = args[coding_arg_post_read_conversion];
10022 CHECK_SYMBOL (val);
10023 ASET (attrs, coding_attr_post_read, val);
10025 val = args[coding_arg_pre_write_conversion];
10026 CHECK_SYMBOL (val);
10027 ASET (attrs, coding_attr_pre_write, val);
10029 val = args[coding_arg_default_char];
10030 if (NILP (val))
10031 ASET (attrs, coding_attr_default_char, make_number (' '));
10032 else
10034 CHECK_CHARACTER (val);
10035 ASET (attrs, coding_attr_default_char, val);
10038 val = args[coding_arg_for_unibyte];
10039 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10041 val = args[coding_arg_plist];
10042 CHECK_LIST (val);
10043 ASET (attrs, coding_attr_plist, val);
10045 if (EQ (coding_type, Qcharset))
10047 /* Generate a lisp vector of 256 elements. Each element is nil,
10048 integer, or a list of charset IDs.
10050 If Nth element is nil, the byte code N is invalid in this
10051 coding system.
10053 If Nth element is a number NUM, N is the first byte of a
10054 charset whose ID is NUM.
10056 If Nth element is a list of charset IDs, N is the first byte
10057 of one of them. The list is sorted by dimensions of the
10058 charsets. A charset of smaller dimension comes first. */
10059 val = Fmake_vector (make_number (256), Qnil);
10061 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10063 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10064 int dim = CHARSET_DIMENSION (charset);
10065 int idx = (dim - 1) * 4;
10067 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10068 ASET (attrs, coding_attr_ascii_compat, Qt);
10070 for (i = charset->code_space[idx];
10071 i <= charset->code_space[idx + 1]; i++)
10073 Lisp_Object tmp, tmp2;
10074 int dim2;
10076 tmp = AREF (val, i);
10077 if (NILP (tmp))
10078 tmp = XCAR (tail);
10079 else if (NUMBERP (tmp))
10081 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10082 if (dim < dim2)
10083 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
10084 else
10085 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
10087 else
10089 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10091 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10092 if (dim < dim2)
10093 break;
10095 if (NILP (tmp2))
10096 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
10097 else
10099 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10100 XSETCAR (tmp2, XCAR (tail));
10103 ASET (val, i, tmp);
10106 ASET (attrs, coding_attr_charset_valids, val);
10107 category = coding_category_charset;
10109 else if (EQ (coding_type, Qccl))
10111 Lisp_Object valids;
10113 if (nargs < coding_arg_ccl_max)
10114 goto short_args;
10116 val = args[coding_arg_ccl_decoder];
10117 CHECK_CCL_PROGRAM (val);
10118 if (VECTORP (val))
10119 val = Fcopy_sequence (val);
10120 ASET (attrs, coding_attr_ccl_decoder, val);
10122 val = args[coding_arg_ccl_encoder];
10123 CHECK_CCL_PROGRAM (val);
10124 if (VECTORP (val))
10125 val = Fcopy_sequence (val);
10126 ASET (attrs, coding_attr_ccl_encoder, val);
10128 val = args[coding_arg_ccl_valids];
10129 valids = Fmake_string (make_number (256), make_number (0));
10130 for (tail = val; CONSP (tail); tail = XCDR (tail))
10132 int from, to;
10134 val = XCAR (tail);
10135 if (INTEGERP (val))
10137 if (! (0 <= XINT (val) && XINT (val) <= 255))
10138 args_out_of_range_3 (val, make_number (0), make_number (255));
10139 from = to = XINT (val);
10141 else
10143 CHECK_CONS (val);
10144 CHECK_NATNUM_CAR (val);
10145 CHECK_NUMBER_CDR (val);
10146 if (XINT (XCAR (val)) > 255)
10147 args_out_of_range_3 (XCAR (val),
10148 make_number (0), make_number (255));
10149 from = XINT (XCAR (val));
10150 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10151 args_out_of_range_3 (XCDR (val),
10152 XCAR (val), make_number (255));
10153 to = XINT (XCDR (val));
10155 for (i = from; i <= to; i++)
10156 SSET (valids, i, 1);
10158 ASET (attrs, coding_attr_ccl_valids, valids);
10160 category = coding_category_ccl;
10162 else if (EQ (coding_type, Qutf_16))
10164 Lisp_Object bom, endian;
10166 ASET (attrs, coding_attr_ascii_compat, Qnil);
10168 if (nargs < coding_arg_utf16_max)
10169 goto short_args;
10171 bom = args[coding_arg_utf16_bom];
10172 if (! NILP (bom) && ! EQ (bom, Qt))
10174 CHECK_CONS (bom);
10175 val = XCAR (bom);
10176 CHECK_CODING_SYSTEM (val);
10177 val = XCDR (bom);
10178 CHECK_CODING_SYSTEM (val);
10180 ASET (attrs, coding_attr_utf_bom, bom);
10182 endian = args[coding_arg_utf16_endian];
10183 CHECK_SYMBOL (endian);
10184 if (NILP (endian))
10185 endian = Qbig;
10186 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10187 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10188 ASET (attrs, coding_attr_utf_16_endian, endian);
10190 category = (CONSP (bom)
10191 ? coding_category_utf_16_auto
10192 : NILP (bom)
10193 ? (EQ (endian, Qbig)
10194 ? coding_category_utf_16_be_nosig
10195 : coding_category_utf_16_le_nosig)
10196 : (EQ (endian, Qbig)
10197 ? coding_category_utf_16_be
10198 : coding_category_utf_16_le));
10200 else if (EQ (coding_type, Qiso_2022))
10202 Lisp_Object initial, reg_usage, request, flags;
10204 if (nargs < coding_arg_iso2022_max)
10205 goto short_args;
10207 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10208 CHECK_VECTOR (initial);
10209 for (i = 0; i < 4; i++)
10211 val = AREF (initial, i);
10212 if (! NILP (val))
10214 struct charset *charset;
10216 CHECK_CHARSET_GET_CHARSET (val, charset);
10217 ASET (initial, i, make_number (CHARSET_ID (charset)));
10218 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10219 ASET (attrs, coding_attr_ascii_compat, Qt);
10221 else
10222 ASET (initial, i, make_number (-1));
10225 reg_usage = args[coding_arg_iso2022_reg_usage];
10226 CHECK_CONS (reg_usage);
10227 CHECK_NUMBER_CAR (reg_usage);
10228 CHECK_NUMBER_CDR (reg_usage);
10230 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10231 for (tail = request; CONSP (tail); tail = XCDR (tail))
10233 int id;
10234 Lisp_Object tmp1;
10236 val = XCAR (tail);
10237 CHECK_CONS (val);
10238 tmp1 = XCAR (val);
10239 CHECK_CHARSET_GET_ID (tmp1, id);
10240 CHECK_NATNUM_CDR (val);
10241 if (XINT (XCDR (val)) >= 4)
10242 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10243 XSETCAR (val, make_number (id));
10246 flags = args[coding_arg_iso2022_flags];
10247 CHECK_NATNUM (flags);
10248 i = XINT (flags) & INT_MAX;
10249 if (EQ (args[coding_arg_charset_list], Qiso_2022))
10250 i |= CODING_ISO_FLAG_FULL_SUPPORT;
10251 flags = make_number (i);
10253 ASET (attrs, coding_attr_iso_initial, initial);
10254 ASET (attrs, coding_attr_iso_usage, reg_usage);
10255 ASET (attrs, coding_attr_iso_request, request);
10256 ASET (attrs, coding_attr_iso_flags, flags);
10257 setup_iso_safe_charsets (attrs);
10259 if (i & CODING_ISO_FLAG_SEVEN_BITS)
10260 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10261 | CODING_ISO_FLAG_SINGLE_SHIFT))
10262 ? coding_category_iso_7_else
10263 : EQ (args[coding_arg_charset_list], Qiso_2022)
10264 ? coding_category_iso_7
10265 : coding_category_iso_7_tight);
10266 else
10268 int id = XINT (AREF (initial, 1));
10270 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10271 || EQ (args[coding_arg_charset_list], Qiso_2022)
10272 || id < 0)
10273 ? coding_category_iso_8_else
10274 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10275 ? coding_category_iso_8_1
10276 : coding_category_iso_8_2);
10278 if (category != coding_category_iso_8_1
10279 && category != coding_category_iso_8_2)
10280 ASET (attrs, coding_attr_ascii_compat, Qnil);
10282 else if (EQ (coding_type, Qemacs_mule))
10284 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10285 ASET (attrs, coding_attr_emacs_mule_full, Qt);
10286 ASET (attrs, coding_attr_ascii_compat, Qt);
10287 category = coding_category_emacs_mule;
10289 else if (EQ (coding_type, Qshift_jis))
10292 struct charset *charset;
10294 if (XINT (Flength (charset_list)) != 3
10295 && XINT (Flength (charset_list)) != 4)
10296 error ("There should be three or four charsets");
10298 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10299 if (CHARSET_DIMENSION (charset) != 1)
10300 error ("Dimension of charset %s is not one",
10301 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10302 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10303 ASET (attrs, coding_attr_ascii_compat, Qt);
10305 charset_list = XCDR (charset_list);
10306 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10307 if (CHARSET_DIMENSION (charset) != 1)
10308 error ("Dimension of charset %s is not one",
10309 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10311 charset_list = XCDR (charset_list);
10312 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10313 if (CHARSET_DIMENSION (charset) != 2)
10314 error ("Dimension of charset %s is not two",
10315 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10317 charset_list = XCDR (charset_list);
10318 if (! NILP (charset_list))
10320 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10321 if (CHARSET_DIMENSION (charset) != 2)
10322 error ("Dimension of charset %s is not two",
10323 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10326 category = coding_category_sjis;
10327 Vsjis_coding_system = name;
10329 else if (EQ (coding_type, Qbig5))
10331 struct charset *charset;
10333 if (XINT (Flength (charset_list)) != 2)
10334 error ("There should be just two charsets");
10336 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10337 if (CHARSET_DIMENSION (charset) != 1)
10338 error ("Dimension of charset %s is not one",
10339 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10340 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10341 ASET (attrs, coding_attr_ascii_compat, Qt);
10343 charset_list = XCDR (charset_list);
10344 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10345 if (CHARSET_DIMENSION (charset) != 2)
10346 error ("Dimension of charset %s is not two",
10347 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10349 category = coding_category_big5;
10350 Vbig5_coding_system = name;
10352 else if (EQ (coding_type, Qraw_text))
10354 category = coding_category_raw_text;
10355 ASET (attrs, coding_attr_ascii_compat, Qt);
10357 else if (EQ (coding_type, Qutf_8))
10359 Lisp_Object bom;
10361 if (nargs < coding_arg_utf8_max)
10362 goto short_args;
10364 bom = args[coding_arg_utf8_bom];
10365 if (! NILP (bom) && ! EQ (bom, Qt))
10367 CHECK_CONS (bom);
10368 val = XCAR (bom);
10369 CHECK_CODING_SYSTEM (val);
10370 val = XCDR (bom);
10371 CHECK_CODING_SYSTEM (val);
10373 ASET (attrs, coding_attr_utf_bom, bom);
10374 if (NILP (bom))
10375 ASET (attrs, coding_attr_ascii_compat, Qt);
10377 category = (CONSP (bom) ? coding_category_utf_8_auto
10378 : NILP (bom) ? coding_category_utf_8_nosig
10379 : coding_category_utf_8_sig);
10381 else if (EQ (coding_type, Qundecided))
10383 if (nargs < coding_arg_undecided_max)
10384 goto short_args;
10385 ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10386 args[coding_arg_undecided_inhibit_null_byte_detection]);
10387 ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10388 args[coding_arg_undecided_inhibit_iso_escape_detection]);
10389 ASET (attrs, coding_attr_undecided_prefer_utf_8,
10390 args[coding_arg_undecided_prefer_utf_8]);
10391 category = coding_category_undecided;
10393 else
10394 error ("Invalid coding system type: %s",
10395 SDATA (SYMBOL_NAME (coding_type)));
10397 ASET (attrs, coding_attr_category, make_number (category));
10398 ASET (attrs, coding_attr_plist,
10399 Fcons (QCcategory,
10400 Fcons (AREF (Vcoding_category_table, category),
10401 CODING_ATTR_PLIST (attrs))));
10402 ASET (attrs, coding_attr_plist,
10403 Fcons (QCascii_compatible_p,
10404 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10405 CODING_ATTR_PLIST (attrs))));
10407 eol_type = args[coding_arg_eol_type];
10408 if (! NILP (eol_type)
10409 && ! EQ (eol_type, Qunix)
10410 && ! EQ (eol_type, Qdos)
10411 && ! EQ (eol_type, Qmac))
10412 error ("Invalid eol-type");
10414 aliases = Fcons (name, Qnil);
10416 if (NILP (eol_type))
10418 eol_type = make_subsidiaries (name);
10419 for (i = 0; i < 3; i++)
10421 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10423 this_name = AREF (eol_type, i);
10424 this_aliases = Fcons (this_name, Qnil);
10425 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10426 this_spec = make_uninit_vector (3);
10427 ASET (this_spec, 0, attrs);
10428 ASET (this_spec, 1, this_aliases);
10429 ASET (this_spec, 2, this_eol_type);
10430 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10431 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10432 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10433 if (NILP (val))
10434 Vcoding_system_alist
10435 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10436 Vcoding_system_alist);
10440 spec_vec = make_uninit_vector (3);
10441 ASET (spec_vec, 0, attrs);
10442 ASET (spec_vec, 1, aliases);
10443 ASET (spec_vec, 2, eol_type);
10445 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10446 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10447 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10448 if (NILP (val))
10449 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10450 Vcoding_system_alist);
10453 int id = coding_categories[category].id;
10455 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10456 setup_coding_system (name, &coding_categories[category]);
10459 return Qnil;
10461 short_args:
10462 return Fsignal (Qwrong_number_of_arguments,
10463 Fcons (intern ("define-coding-system-internal"),
10464 make_number (nargs)));
10468 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10469 3, 3, 0,
10470 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10471 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10473 Lisp_Object spec, attrs;
10475 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10476 attrs = AREF (spec, 0);
10477 if (EQ (prop, QCmnemonic))
10479 if (! STRINGP (val))
10480 CHECK_CHARACTER (val);
10481 ASET (attrs, coding_attr_mnemonic, val);
10483 else if (EQ (prop, QCdefault_char))
10485 if (NILP (val))
10486 val = make_number (' ');
10487 else
10488 CHECK_CHARACTER (val);
10489 ASET (attrs, coding_attr_default_char, val);
10491 else if (EQ (prop, QCdecode_translation_table))
10493 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10494 CHECK_SYMBOL (val);
10495 ASET (attrs, coding_attr_decode_tbl, val);
10497 else if (EQ (prop, QCencode_translation_table))
10499 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10500 CHECK_SYMBOL (val);
10501 ASET (attrs, coding_attr_encode_tbl, val);
10503 else if (EQ (prop, QCpost_read_conversion))
10505 CHECK_SYMBOL (val);
10506 ASET (attrs, coding_attr_post_read, val);
10508 else if (EQ (prop, QCpre_write_conversion))
10510 CHECK_SYMBOL (val);
10511 ASET (attrs, coding_attr_pre_write, val);
10513 else if (EQ (prop, QCascii_compatible_p))
10515 ASET (attrs, coding_attr_ascii_compat, val);
10518 ASET (attrs, coding_attr_plist,
10519 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10520 return val;
10524 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10525 Sdefine_coding_system_alias, 2, 2, 0,
10526 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10527 (Lisp_Object alias, Lisp_Object coding_system)
10529 Lisp_Object spec, aliases, eol_type, val;
10531 CHECK_SYMBOL (alias);
10532 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10533 aliases = AREF (spec, 1);
10534 /* ALIASES should be a list of length more than zero, and the first
10535 element is a base coding system. Append ALIAS at the tail of the
10536 list. */
10537 while (!NILP (XCDR (aliases)))
10538 aliases = XCDR (aliases);
10539 XSETCDR (aliases, Fcons (alias, Qnil));
10541 eol_type = AREF (spec, 2);
10542 if (VECTORP (eol_type))
10544 Lisp_Object subsidiaries;
10545 int i;
10547 subsidiaries = make_subsidiaries (alias);
10548 for (i = 0; i < 3; i++)
10549 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10550 AREF (eol_type, i));
10553 Fputhash (alias, spec, Vcoding_system_hash_table);
10554 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10555 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10556 if (NILP (val))
10557 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10558 Vcoding_system_alist);
10560 return Qnil;
10563 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10564 1, 1, 0,
10565 doc: /* Return the base of CODING-SYSTEM.
10566 Any alias or subsidiary coding system is not a base coding system. */)
10567 (Lisp_Object coding_system)
10569 Lisp_Object spec, attrs;
10571 if (NILP (coding_system))
10572 return (Qno_conversion);
10573 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10574 attrs = AREF (spec, 0);
10575 return CODING_ATTR_BASE_NAME (attrs);
10578 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10579 1, 1, 0,
10580 doc: "Return the property list of CODING-SYSTEM.")
10581 (Lisp_Object coding_system)
10583 Lisp_Object spec, attrs;
10585 if (NILP (coding_system))
10586 coding_system = Qno_conversion;
10587 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10588 attrs = AREF (spec, 0);
10589 return CODING_ATTR_PLIST (attrs);
10593 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10594 1, 1, 0,
10595 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10596 (Lisp_Object coding_system)
10598 Lisp_Object spec;
10600 if (NILP (coding_system))
10601 coding_system = Qno_conversion;
10602 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10603 return AREF (spec, 1);
10606 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10607 Scoding_system_eol_type, 1, 1, 0,
10608 doc: /* Return eol-type of CODING-SYSTEM.
10609 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10611 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10612 and CR respectively.
10614 A vector value indicates that a format of end-of-line should be
10615 detected automatically. Nth element of the vector is the subsidiary
10616 coding system whose eol-type is N. */)
10617 (Lisp_Object coding_system)
10619 Lisp_Object spec, eol_type;
10620 int n;
10622 if (NILP (coding_system))
10623 coding_system = Qno_conversion;
10624 if (! CODING_SYSTEM_P (coding_system))
10625 return Qnil;
10626 spec = CODING_SYSTEM_SPEC (coding_system);
10627 eol_type = AREF (spec, 2);
10628 if (VECTORP (eol_type))
10629 return Fcopy_sequence (eol_type);
10630 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10631 return make_number (n);
10634 #endif /* emacs */
10637 /*** 9. Post-amble ***/
10639 void
10640 init_coding_once (void)
10642 int i;
10644 for (i = 0; i < coding_category_max; i++)
10646 coding_categories[i].id = -1;
10647 coding_priorities[i] = i;
10650 /* ISO2022 specific initialize routine. */
10651 for (i = 0; i < 0x20; i++)
10652 iso_code_class[i] = ISO_control_0;
10653 for (i = 0x21; i < 0x7F; i++)
10654 iso_code_class[i] = ISO_graphic_plane_0;
10655 for (i = 0x80; i < 0xA0; i++)
10656 iso_code_class[i] = ISO_control_1;
10657 for (i = 0xA1; i < 0xFF; i++)
10658 iso_code_class[i] = ISO_graphic_plane_1;
10659 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10660 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10661 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10662 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10663 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10664 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10665 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10666 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10667 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10669 for (i = 0; i < 256; i++)
10671 emacs_mule_bytes[i] = 1;
10673 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10674 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10675 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10676 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10679 #ifdef emacs
10681 void
10682 syms_of_coding (void)
10684 staticpro (&Vcoding_system_hash_table);
10686 Lisp_Object args[2];
10687 args[0] = QCtest;
10688 args[1] = Qeq;
10689 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10692 staticpro (&Vsjis_coding_system);
10693 Vsjis_coding_system = Qnil;
10695 staticpro (&Vbig5_coding_system);
10696 Vbig5_coding_system = Qnil;
10698 staticpro (&Vcode_conversion_reused_workbuf);
10699 Vcode_conversion_reused_workbuf = Qnil;
10701 staticpro (&Vcode_conversion_workbuf_name);
10702 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10704 reused_workbuf_in_use = 0;
10706 DEFSYM (Qcharset, "charset");
10707 DEFSYM (Qtarget_idx, "target-idx");
10708 DEFSYM (Qcoding_system_history, "coding-system-history");
10709 Fset (Qcoding_system_history, Qnil);
10711 /* Target FILENAME is the first argument. */
10712 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10713 /* Target FILENAME is the third argument. */
10714 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10716 DEFSYM (Qcall_process, "call-process");
10717 /* Target PROGRAM is the first argument. */
10718 Fput (Qcall_process, Qtarget_idx, make_number (0));
10720 DEFSYM (Qcall_process_region, "call-process-region");
10721 /* Target PROGRAM is the third argument. */
10722 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10724 DEFSYM (Qstart_process, "start-process");
10725 /* Target PROGRAM is the third argument. */
10726 Fput (Qstart_process, Qtarget_idx, make_number (2));
10728 DEFSYM (Qopen_network_stream, "open-network-stream");
10729 /* Target SERVICE is the fourth argument. */
10730 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10732 DEFSYM (Qcoding_system, "coding-system");
10733 DEFSYM (Qcoding_aliases, "coding-aliases");
10735 DEFSYM (Qeol_type, "eol-type");
10736 DEFSYM (Qunix, "unix");
10737 DEFSYM (Qdos, "dos");
10738 DEFSYM (Qmac, "mac");
10740 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10741 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10742 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10743 DEFSYM (Qdefault_char, "default-char");
10744 DEFSYM (Qundecided, "undecided");
10745 DEFSYM (Qno_conversion, "no-conversion");
10746 DEFSYM (Qraw_text, "raw-text");
10748 DEFSYM (Qiso_2022, "iso-2022");
10750 DEFSYM (Qutf_8, "utf-8");
10751 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10753 #if defined (WINDOWSNT) || defined (CYGWIN)
10754 /* No, not utf-16-le: that one has a BOM. */
10755 DEFSYM (Qutf_16le, "utf-16le");
10756 #endif
10758 DEFSYM (Qutf_16, "utf-16");
10759 DEFSYM (Qbig, "big");
10760 DEFSYM (Qlittle, "little");
10762 DEFSYM (Qshift_jis, "shift-jis");
10763 DEFSYM (Qbig5, "big5");
10765 DEFSYM (Qcoding_system_p, "coding-system-p");
10767 DEFSYM (Qcoding_system_error, "coding-system-error");
10768 Fput (Qcoding_system_error, Qerror_conditions,
10769 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10770 Fput (Qcoding_system_error, Qerror_message,
10771 build_pure_c_string ("Invalid coding system"));
10773 /* Intern this now in case it isn't already done.
10774 Setting this variable twice is harmless.
10775 But don't staticpro it here--that is done in alloc.c. */
10776 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10778 DEFSYM (Qtranslation_table, "translation-table");
10779 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10780 DEFSYM (Qtranslation_table_id, "translation-table-id");
10781 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10782 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10784 DEFSYM (Qvalid_codes, "valid-codes");
10786 DEFSYM (Qemacs_mule, "emacs-mule");
10788 DEFSYM (QCcategory, ":category");
10789 DEFSYM (QCmnemonic, ":mnemonic");
10790 DEFSYM (QCdefault_char, ":default-char");
10791 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10792 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10793 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10794 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10795 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10797 Vcoding_category_table
10798 = Fmake_vector (make_number (coding_category_max), Qnil);
10799 staticpro (&Vcoding_category_table);
10800 /* Followings are target of code detection. */
10801 ASET (Vcoding_category_table, coding_category_iso_7,
10802 intern_c_string ("coding-category-iso-7"));
10803 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10804 intern_c_string ("coding-category-iso-7-tight"));
10805 ASET (Vcoding_category_table, coding_category_iso_8_1,
10806 intern_c_string ("coding-category-iso-8-1"));
10807 ASET (Vcoding_category_table, coding_category_iso_8_2,
10808 intern_c_string ("coding-category-iso-8-2"));
10809 ASET (Vcoding_category_table, coding_category_iso_7_else,
10810 intern_c_string ("coding-category-iso-7-else"));
10811 ASET (Vcoding_category_table, coding_category_iso_8_else,
10812 intern_c_string ("coding-category-iso-8-else"));
10813 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10814 intern_c_string ("coding-category-utf-8-auto"));
10815 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10816 intern_c_string ("coding-category-utf-8"));
10817 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10818 intern_c_string ("coding-category-utf-8-sig"));
10819 ASET (Vcoding_category_table, coding_category_utf_16_be,
10820 intern_c_string ("coding-category-utf-16-be"));
10821 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10822 intern_c_string ("coding-category-utf-16-auto"));
10823 ASET (Vcoding_category_table, coding_category_utf_16_le,
10824 intern_c_string ("coding-category-utf-16-le"));
10825 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10826 intern_c_string ("coding-category-utf-16-be-nosig"));
10827 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10828 intern_c_string ("coding-category-utf-16-le-nosig"));
10829 ASET (Vcoding_category_table, coding_category_charset,
10830 intern_c_string ("coding-category-charset"));
10831 ASET (Vcoding_category_table, coding_category_sjis,
10832 intern_c_string ("coding-category-sjis"));
10833 ASET (Vcoding_category_table, coding_category_big5,
10834 intern_c_string ("coding-category-big5"));
10835 ASET (Vcoding_category_table, coding_category_ccl,
10836 intern_c_string ("coding-category-ccl"));
10837 ASET (Vcoding_category_table, coding_category_emacs_mule,
10838 intern_c_string ("coding-category-emacs-mule"));
10839 /* Followings are NOT target of code detection. */
10840 ASET (Vcoding_category_table, coding_category_raw_text,
10841 intern_c_string ("coding-category-raw-text"));
10842 ASET (Vcoding_category_table, coding_category_undecided,
10843 intern_c_string ("coding-category-undecided"));
10845 DEFSYM (Qinsufficient_source, "insufficient-source");
10846 DEFSYM (Qinvalid_source, "invalid-source");
10847 DEFSYM (Qinterrupted, "interrupted");
10848 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10850 defsubr (&Scoding_system_p);
10851 defsubr (&Sread_coding_system);
10852 defsubr (&Sread_non_nil_coding_system);
10853 defsubr (&Scheck_coding_system);
10854 defsubr (&Sdetect_coding_region);
10855 defsubr (&Sdetect_coding_string);
10856 defsubr (&Sfind_coding_systems_region_internal);
10857 defsubr (&Sunencodable_char_position);
10858 defsubr (&Scheck_coding_systems_region);
10859 defsubr (&Sdecode_coding_region);
10860 defsubr (&Sencode_coding_region);
10861 defsubr (&Sdecode_coding_string);
10862 defsubr (&Sencode_coding_string);
10863 defsubr (&Sdecode_sjis_char);
10864 defsubr (&Sencode_sjis_char);
10865 defsubr (&Sdecode_big5_char);
10866 defsubr (&Sencode_big5_char);
10867 defsubr (&Sset_terminal_coding_system_internal);
10868 defsubr (&Sset_safe_terminal_coding_system_internal);
10869 defsubr (&Sterminal_coding_system);
10870 defsubr (&Sset_keyboard_coding_system_internal);
10871 defsubr (&Skeyboard_coding_system);
10872 defsubr (&Sfind_operation_coding_system);
10873 defsubr (&Sset_coding_system_priority);
10874 defsubr (&Sdefine_coding_system_internal);
10875 defsubr (&Sdefine_coding_system_alias);
10876 defsubr (&Scoding_system_put);
10877 defsubr (&Scoding_system_base);
10878 defsubr (&Scoding_system_plist);
10879 defsubr (&Scoding_system_aliases);
10880 defsubr (&Scoding_system_eol_type);
10881 defsubr (&Scoding_system_priority_list);
10883 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10884 doc: /* List of coding systems.
10886 Do not alter the value of this variable manually. This variable should be
10887 updated by the functions `define-coding-system' and
10888 `define-coding-system-alias'. */);
10889 Vcoding_system_list = Qnil;
10891 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10892 doc: /* Alist of coding system names.
10893 Each element is one element list of coding system name.
10894 This variable is given to `completing-read' as COLLECTION argument.
10896 Do not alter the value of this variable manually. This variable should be
10897 updated by the functions `make-coding-system' and
10898 `define-coding-system-alias'. */);
10899 Vcoding_system_alist = Qnil;
10901 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10902 doc: /* List of coding-categories (symbols) ordered by priority.
10904 On detecting a coding system, Emacs tries code detection algorithms
10905 associated with each coding-category one by one in this order. When
10906 one algorithm agrees with a byte sequence of source text, the coding
10907 system bound to the corresponding coding-category is selected.
10909 Don't modify this variable directly, but use `set-coding-system-priority'. */);
10911 int i;
10913 Vcoding_category_list = Qnil;
10914 for (i = coding_category_max - 1; i >= 0; i--)
10915 Vcoding_category_list
10916 = Fcons (AREF (Vcoding_category_table, i),
10917 Vcoding_category_list);
10920 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10921 doc: /* Specify the coding system for read operations.
10922 It is useful to bind this variable with `let', but do not set it globally.
10923 If the value is a coding system, it is used for decoding on read operation.
10924 If not, an appropriate element is used from one of the coding system alists.
10925 There are three such tables: `file-coding-system-alist',
10926 `process-coding-system-alist', and `network-coding-system-alist'. */);
10927 Vcoding_system_for_read = Qnil;
10929 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10930 doc: /* Specify the coding system for write operations.
10931 Programs bind this variable with `let', but you should not set it globally.
10932 If the value is a coding system, it is used for encoding of output,
10933 when writing it to a file and when sending it to a file or subprocess.
10935 If this does not specify a coding system, an appropriate element
10936 is used from one of the coding system alists.
10937 There are three such tables: `file-coding-system-alist',
10938 `process-coding-system-alist', and `network-coding-system-alist'.
10939 For output to files, if the above procedure does not specify a coding system,
10940 the value of `buffer-file-coding-system' is used. */);
10941 Vcoding_system_for_write = Qnil;
10943 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10944 doc: /*
10945 Coding system used in the latest file or process I/O. */);
10946 Vlast_coding_system_used = Qnil;
10948 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10949 doc: /*
10950 Error status of the last code conversion.
10952 When an error was detected in the last code conversion, this variable
10953 is set to one of the following symbols.
10954 `insufficient-source'
10955 `inconsistent-eol'
10956 `invalid-source'
10957 `interrupted'
10958 `insufficient-memory'
10959 When no error was detected, the value doesn't change. So, to check
10960 the error status of a code conversion by this variable, you must
10961 explicitly set this variable to nil before performing code
10962 conversion. */);
10963 Vlast_code_conversion_error = Qnil;
10965 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10966 doc: /*
10967 *Non-nil means always inhibit code conversion of end-of-line format.
10968 See info node `Coding Systems' and info node `Text and Binary' concerning
10969 such conversion. */);
10970 inhibit_eol_conversion = 0;
10972 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10973 doc: /*
10974 Non-nil means process buffer inherits coding system of process output.
10975 Bind it to t if the process output is to be treated as if it were a file
10976 read from some filesystem. */);
10977 inherit_process_coding_system = 0;
10979 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10980 doc: /*
10981 Alist to decide a coding system to use for a file I/O operation.
10982 The format is ((PATTERN . VAL) ...),
10983 where PATTERN is a regular expression matching a file name,
10984 VAL is a coding system, a cons of coding systems, or a function symbol.
10985 If VAL is a coding system, it is used for both decoding and encoding
10986 the file contents.
10987 If VAL is a cons of coding systems, the car part is used for decoding,
10988 and the cdr part is used for encoding.
10989 If VAL is a function symbol, the function must return a coding system
10990 or a cons of coding systems which are used as above. The function is
10991 called with an argument that is a list of the arguments with which
10992 `find-operation-coding-system' was called. If the function can't decide
10993 a coding system, it can return `undecided' so that the normal
10994 code-detection is performed.
10996 See also the function `find-operation-coding-system'
10997 and the variable `auto-coding-alist'. */);
10998 Vfile_coding_system_alist = Qnil;
11000 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11001 doc: /*
11002 Alist to decide a coding system to use for a process I/O operation.
11003 The format is ((PATTERN . VAL) ...),
11004 where PATTERN is a regular expression matching a program name,
11005 VAL is a coding system, a cons of coding systems, or a function symbol.
11006 If VAL is a coding system, it is used for both decoding what received
11007 from the program and encoding what sent to the program.
11008 If VAL is a cons of coding systems, the car part is used for decoding,
11009 and the cdr part is used for encoding.
11010 If VAL is a function symbol, the function must return a coding system
11011 or a cons of coding systems which are used as above.
11013 See also the function `find-operation-coding-system'. */);
11014 Vprocess_coding_system_alist = Qnil;
11016 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11017 doc: /*
11018 Alist to decide a coding system to use for a network I/O operation.
11019 The format is ((PATTERN . VAL) ...),
11020 where PATTERN is a regular expression matching a network service name
11021 or is a port number to connect to,
11022 VAL is a coding system, a cons of coding systems, or a function symbol.
11023 If VAL is a coding system, it is used for both decoding what received
11024 from the network stream and encoding what sent to the network stream.
11025 If VAL is a cons of coding systems, the car part is used for decoding,
11026 and the cdr part is used for encoding.
11027 If VAL is a function symbol, the function must return a coding system
11028 or a cons of coding systems which are used as above.
11030 See also the function `find-operation-coding-system'. */);
11031 Vnetwork_coding_system_alist = Qnil;
11033 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11034 doc: /* Coding system to use with system messages.
11035 Also used for decoding keyboard input on X Window system. */);
11036 Vlocale_coding_system = Qnil;
11038 /* The eol mnemonics are reset in startup.el system-dependently. */
11039 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11040 doc: /*
11041 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
11042 eol_mnemonic_unix = build_pure_c_string (":");
11044 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11045 doc: /*
11046 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
11047 eol_mnemonic_dos = build_pure_c_string ("\\");
11049 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11050 doc: /*
11051 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
11052 eol_mnemonic_mac = build_pure_c_string ("/");
11054 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11055 doc: /*
11056 *String displayed in mode line when end-of-line format is not yet determined. */);
11057 eol_mnemonic_undecided = build_pure_c_string (":");
11059 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11060 doc: /*
11061 *Non-nil enables character translation while encoding and decoding. */);
11062 Venable_character_translation = Qt;
11064 DEFVAR_LISP ("standard-translation-table-for-decode",
11065 Vstandard_translation_table_for_decode,
11066 doc: /* Table for translating characters while decoding. */);
11067 Vstandard_translation_table_for_decode = Qnil;
11069 DEFVAR_LISP ("standard-translation-table-for-encode",
11070 Vstandard_translation_table_for_encode,
11071 doc: /* Table for translating characters while encoding. */);
11072 Vstandard_translation_table_for_encode = Qnil;
11074 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11075 doc: /* Alist of charsets vs revision numbers.
11076 While encoding, if a charset (car part of an element) is found,
11077 designate it with the escape sequence identifying revision (cdr part
11078 of the element). */);
11079 Vcharset_revision_table = Qnil;
11081 DEFVAR_LISP ("default-process-coding-system",
11082 Vdefault_process_coding_system,
11083 doc: /* Cons of coding systems used for process I/O by default.
11084 The car part is used for decoding a process output,
11085 the cdr part is used for encoding a text to be sent to a process. */);
11086 Vdefault_process_coding_system = Qnil;
11088 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11089 doc: /*
11090 Table of extra Latin codes in the range 128..159 (inclusive).
11091 This is a vector of length 256.
11092 If Nth element is non-nil, the existence of code N in a file
11093 \(or output of subprocess) doesn't prevent it to be detected as
11094 a coding system of ISO 2022 variant which has a flag
11095 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11096 or reading output of a subprocess.
11097 Only 128th through 159th elements have a meaning. */);
11098 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11100 DEFVAR_LISP ("select-safe-coding-system-function",
11101 Vselect_safe_coding_system_function,
11102 doc: /*
11103 Function to call to select safe coding system for encoding a text.
11105 If set, this function is called to force a user to select a proper
11106 coding system which can encode the text in the case that a default
11107 coding system used in each operation can't encode the text. The
11108 function should take care that the buffer is not modified while
11109 the coding system is being selected.
11111 The default value is `select-safe-coding-system' (which see). */);
11112 Vselect_safe_coding_system_function = Qnil;
11114 DEFVAR_BOOL ("coding-system-require-warning",
11115 coding_system_require_warning,
11116 doc: /* Internal use only.
11117 If non-nil, on writing a file, `select-safe-coding-system-function' is
11118 called even if `coding-system-for-write' is non-nil. The command
11119 `universal-coding-system-argument' binds this variable to t temporarily. */);
11120 coding_system_require_warning = 0;
11123 DEFVAR_BOOL ("inhibit-iso-escape-detection",
11124 inhibit_iso_escape_detection,
11125 doc: /*
11126 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11128 When Emacs reads text, it tries to detect how the text is encoded.
11129 This code detection is sensitive to escape sequences. If Emacs sees
11130 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11131 of the ISO2022 encodings, and decodes text by the corresponding coding
11132 system (e.g. `iso-2022-7bit').
11134 However, there may be a case that you want to read escape sequences in
11135 a file as is. In such a case, you can set this variable to non-nil.
11136 Then the code detection will ignore any escape sequences, and no text is
11137 detected as encoded in some ISO-2022 encoding. The result is that all
11138 escape sequences become visible in a buffer.
11140 The default value is nil, and it is strongly recommended not to change
11141 it. That is because many Emacs Lisp source files that contain
11142 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11143 in Emacs's distribution, and they won't be decoded correctly on
11144 reading if you suppress escape sequence detection.
11146 The other way to read escape sequences in a file without decoding is
11147 to explicitly specify some coding system that doesn't use ISO-2022
11148 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument]. */);
11149 inhibit_iso_escape_detection = 0;
11151 DEFVAR_BOOL ("inhibit-null-byte-detection",
11152 inhibit_null_byte_detection,
11153 doc: /* If non-nil, Emacs ignores null bytes on code detection.
11154 By default, Emacs treats it as binary data, and does not attempt to
11155 decode it. The effect is as if you specified `no-conversion' for
11156 reading that text.
11158 Set this to non-nil when a regular text happens to include null bytes.
11159 Examples are Index nodes of Info files and null-byte delimited output
11160 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
11161 decode text as usual. */);
11162 inhibit_null_byte_detection = 0;
11164 DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11165 doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11166 Internal use only. Removed after the experimental optimizer gets stable. */);
11167 disable_ascii_optimization = 0;
11169 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11170 doc: /* Char table for translating self-inserting characters.
11171 This is applied to the result of input methods, not their input.
11172 See also `keyboard-translate-table'.
11174 Use of this variable for character code unification was rendered
11175 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11176 internal character representation. */);
11177 Vtranslation_table_for_input = Qnil;
11180 Lisp_Object args[coding_arg_undecided_max];
11181 Lisp_Object plist[16];
11182 int i;
11184 for (i = 0; i < coding_arg_undecided_max; i++)
11185 args[i] = Qnil;
11187 plist[0] = intern_c_string (":name");
11188 plist[1] = args[coding_arg_name] = Qno_conversion;
11189 plist[2] = intern_c_string (":mnemonic");
11190 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11191 plist[4] = intern_c_string (":coding-type");
11192 plist[5] = args[coding_arg_coding_type] = Qraw_text;
11193 plist[6] = intern_c_string (":ascii-compatible-p");
11194 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11195 plist[8] = intern_c_string (":default-char");
11196 plist[9] = args[coding_arg_default_char] = make_number (0);
11197 plist[10] = intern_c_string (":for-unibyte");
11198 plist[11] = args[coding_arg_for_unibyte] = Qt;
11199 plist[12] = intern_c_string (":docstring");
11200 plist[13] = build_pure_c_string ("Do no conversion.\n\
11202 When you visit a file with this coding, the file is read into a\n\
11203 unibyte buffer as is, thus each byte of a file is treated as a\n\
11204 character.");
11205 plist[14] = intern_c_string (":eol-type");
11206 plist[15] = args[coding_arg_eol_type] = Qunix;
11207 args[coding_arg_plist] = Flist (16, plist);
11208 Fdefine_coding_system_internal (coding_arg_max, args);
11210 plist[1] = args[coding_arg_name] = Qundecided;
11211 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11212 plist[5] = args[coding_arg_coding_type] = Qundecided;
11213 /* This is already set.
11214 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11215 plist[8] = intern_c_string (":charset-list");
11216 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11217 plist[11] = args[coding_arg_for_unibyte] = Qnil;
11218 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11219 plist[15] = args[coding_arg_eol_type] = Qnil;
11220 args[coding_arg_plist] = Flist (16, plist);
11221 Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11224 setup_coding_system (Qno_conversion, &safe_terminal_coding);
11227 int i;
11229 for (i = 0; i < coding_category_max; i++)
11230 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11232 #if defined (DOS_NT)
11233 system_eol_type = Qdos;
11234 #else
11235 system_eol_type = Qunix;
11236 #endif
11237 staticpro (&system_eol_type);
11240 char *
11241 emacs_strerror (int error_number)
11243 char *str;
11245 synchronize_system_messages_locale ();
11246 str = strerror (error_number);
11248 if (! NILP (Vlocale_coding_system))
11250 Lisp_Object dec = code_convert_string_norecord (build_string (str),
11251 Vlocale_coding_system,
11253 str = SSDATA (dec);
11256 return str;
11259 #endif /* emacs */