New unwind-protect flavors to better type-check C callbacks.
[emacs.git] / src / coding.c
blobe779197bbdedc0f465fc5c495c9b1cb8e97718e0
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2013 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26 /*** TABLE OF CONTENTS ***
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
44 /*** 0. General comments ***
47 CODING SYSTEM
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
67 o UTF-8
69 o UTF-16
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
94 o BIG5
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
102 o CCL
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
109 o Raw-text
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
115 o No-conversion
117 Like raw text, but don't do end-of-line conversion.
120 END-OF-LINE FORMAT
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
132 STRUCT CODING_SYSTEM
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
141 /* COMMON MACROS */
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
150 Return true if the byte sequence conforms to XXX.
152 Below is the template of these functions. */
154 #if 0
155 static bool
156 detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
161 bool multibytep = coding->src_multibyte;
162 ptrdiff_t consumed_chars = 0;
163 int found = 0;
164 ...;
166 while (1)
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (! __C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX;
177 /* The byte sequence is invalid for XXX. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0;
181 no_more_source:
182 /* The source exhausted successfully. */
183 detect_info->found |= found;
184 return 1;
186 #endif
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
200 Below is the template of these functions. */
202 #if 0
203 static void
204 decode_coding_XXXX (struct coding_system *coding)
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base;
212 /* A buffer to produce decoded characters. */
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
215 bool multibytep = coding->src_multibyte;
217 while (1)
219 src_base = src;
220 if (charbuf < charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
240 #endif
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
258 Below is a template of these functions. */
259 #if 0
260 static void
261 encode_coding_XXX (struct coding_system *coding)
263 bool multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
269 ptrdiff_t produced_chars = 0;
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. */
276 label_no_more_destination:
277 /* How many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
281 #endif
284 /*** 1. Preamble ***/
286 #include <config.h>
287 #include <stdio.h>
289 #ifdef HAVE_WCHAR_H
290 #include <wchar.h>
291 #endif /* HAVE_WCHAR_H */
293 #include "lisp.h"
294 #include "character.h"
295 #include "buffer.h"
296 #include "charset.h"
297 #include "ccl.h"
298 #include "composite.h"
299 #include "coding.h"
300 #include "window.h"
301 #include "frame.h"
302 #include "termhooks.h"
304 Lisp_Object Vcoding_system_hash_table;
306 static Lisp_Object Qcoding_system, Qeol_type;
307 static Lisp_Object Qcoding_aliases;
308 Lisp_Object Qunix, Qdos;
309 static Lisp_Object Qmac;
310 Lisp_Object Qbuffer_file_coding_system;
311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
312 static Lisp_Object Qdefault_char;
313 Lisp_Object Qno_conversion, Qundecided;
314 Lisp_Object Qcharset, Qutf_8;
315 static Lisp_Object Qiso_2022;
316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
317 static Lisp_Object Qbig, Qlittle;
318 static Lisp_Object Qcoding_system_history;
319 static Lisp_Object Qvalid_codes;
320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
323 static Lisp_Object QCascii_compatible_p;
325 Lisp_Object Qcall_process, Qcall_process_region;
326 Lisp_Object Qstart_process, Qopen_network_stream;
327 static Lisp_Object Qtarget_idx;
329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
331 /* If a symbol has this property, evaluate the value to define the
332 symbol as a coding system. */
333 static Lisp_Object Qcoding_system_define_form;
335 /* Format of end-of-line decided by system. This is Qunix on
336 Unix and Mac, Qdos on DOS/Windows.
337 This has an effect only for external encoding (i.e. for output to
338 file and process), not for in-buffer or Lisp string encoding. */
339 static Lisp_Object system_eol_type;
341 #ifdef emacs
343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
345 /* Coding system emacs-mule and raw-text are for converting only
346 end-of-line format. */
347 Lisp_Object Qemacs_mule, Qraw_text;
348 Lisp_Object Qutf_8_emacs;
350 #if defined (WINDOWSNT) || defined (CYGWIN)
351 static Lisp_Object Qutf_16le;
352 #endif
354 /* Coding-systems are handed between Emacs Lisp programs and C internal
355 routines by the following three variables. */
356 /* Coding system to be used to encode text for terminal display when
357 terminal coding system is nil. */
358 struct coding_system safe_terminal_coding;
360 #endif /* emacs */
362 Lisp_Object Qtranslation_table;
363 Lisp_Object Qtranslation_table_id;
364 static Lisp_Object Qtranslation_table_for_decode;
365 static Lisp_Object Qtranslation_table_for_encode;
367 /* Two special coding systems. */
368 static Lisp_Object Vsjis_coding_system;
369 static Lisp_Object Vbig5_coding_system;
371 /* ISO2022 section */
373 #define CODING_ISO_INITIAL(coding, reg) \
374 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
375 coding_attr_iso_initial), \
376 reg)))
379 #define CODING_ISO_REQUEST(coding, charset_id) \
380 (((charset_id) <= (coding)->max_charset_id \
381 ? ((coding)->safe_charsets[charset_id] != 255 \
382 ? (coding)->safe_charsets[charset_id] \
383 : -1) \
384 : -1))
387 #define CODING_ISO_FLAGS(coding) \
388 ((coding)->spec.iso_2022.flags)
389 #define CODING_ISO_DESIGNATION(coding, reg) \
390 ((coding)->spec.iso_2022.current_designation[reg])
391 #define CODING_ISO_INVOCATION(coding, plane) \
392 ((coding)->spec.iso_2022.current_invocation[plane])
393 #define CODING_ISO_SINGLE_SHIFTING(coding) \
394 ((coding)->spec.iso_2022.single_shifting)
395 #define CODING_ISO_BOL(coding) \
396 ((coding)->spec.iso_2022.bol)
397 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
398 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
399 #define CODING_ISO_CMP_STATUS(coding) \
400 (&(coding)->spec.iso_2022.cmp_status)
401 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
402 ((coding)->spec.iso_2022.ctext_extended_segment_len)
403 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
404 ((coding)->spec.iso_2022.embedded_utf_8)
406 /* Control characters of ISO2022. */
407 /* code */ /* function */
408 #define ISO_CODE_SO 0x0E /* shift-out */
409 #define ISO_CODE_SI 0x0F /* shift-in */
410 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
411 #define ISO_CODE_ESC 0x1B /* escape */
412 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
413 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
414 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
416 /* All code (1-byte) of ISO2022 is classified into one of the
417 followings. */
418 enum iso_code_class_type
420 ISO_control_0, /* Control codes in the range
421 0x00..0x1F and 0x7F, except for the
422 following 5 codes. */
423 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
424 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
425 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
426 ISO_escape, /* ISO_CODE_ESC (0x1B) */
427 ISO_control_1, /* Control codes in the range
428 0x80..0x9F, except for the
429 following 3 codes. */
430 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
431 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
432 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
433 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
434 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
435 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
436 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
440 `iso-flags' attribute of an iso2022 coding system. */
442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
443 instead of the correct short-form sequence (e.g. ESC $ A). */
444 #define CODING_ISO_FLAG_LONG_FORM 0x0001
446 /* If set, reset graphic planes and registers at end-of-line to the
447 initial state. */
448 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
450 /* If set, reset graphic planes and registers before any control
451 characters to the initial state. */
452 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
454 /* If set, encode by 7-bit environment. */
455 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
457 /* If set, use locking-shift function. */
458 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
460 /* If set, use single-shift function. Overwrite
461 CODING_ISO_FLAG_LOCKING_SHIFT. */
462 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
464 /* If set, use designation escape sequence. */
465 #define CODING_ISO_FLAG_DESIGNATION 0x0040
467 /* If set, produce revision number sequence. */
468 #define CODING_ISO_FLAG_REVISION 0x0080
470 /* If set, produce ISO6429's direction specifying sequence. */
471 #define CODING_ISO_FLAG_DIRECTION 0x0100
473 /* If set, assume designation states are reset at beginning of line on
474 output. */
475 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
477 /* If set, designation sequence should be placed at beginning of line
478 on output. */
479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
481 /* If set, do not encode unsafe characters on output. */
482 #define CODING_ISO_FLAG_SAFE 0x0800
484 /* If set, extra latin codes (128..159) are accepted as a valid code
485 on input. */
486 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
488 #define CODING_ISO_FLAG_COMPOSITION 0x2000
490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
492 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
494 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
496 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
498 /* A character to be produced on output if encoding of the original
499 character is prohibited by CODING_ISO_FLAG_SAFE. */
500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
502 /* UTF-8 section */
503 #define CODING_UTF_8_BOM(coding) \
504 ((coding)->spec.utf_8_bom)
506 /* UTF-16 section */
507 #define CODING_UTF_16_BOM(coding) \
508 ((coding)->spec.utf_16.bom)
510 #define CODING_UTF_16_ENDIAN(coding) \
511 ((coding)->spec.utf_16.endian)
513 #define CODING_UTF_16_SURROGATE(coding) \
514 ((coding)->spec.utf_16.surrogate)
517 /* CCL section */
518 #define CODING_CCL_DECODER(coding) \
519 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
520 #define CODING_CCL_ENCODER(coding) \
521 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
522 #define CODING_CCL_VALIDS(coding) \
523 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
525 /* Index for each coding category in `coding_categories' */
527 enum coding_category
529 coding_category_iso_7,
530 coding_category_iso_7_tight,
531 coding_category_iso_8_1,
532 coding_category_iso_8_2,
533 coding_category_iso_7_else,
534 coding_category_iso_8_else,
535 coding_category_utf_8_auto,
536 coding_category_utf_8_nosig,
537 coding_category_utf_8_sig,
538 coding_category_utf_16_auto,
539 coding_category_utf_16_be,
540 coding_category_utf_16_le,
541 coding_category_utf_16_be_nosig,
542 coding_category_utf_16_le_nosig,
543 coding_category_charset,
544 coding_category_sjis,
545 coding_category_big5,
546 coding_category_ccl,
547 coding_category_emacs_mule,
548 /* All above are targets of code detection. */
549 coding_category_raw_text,
550 coding_category_undecided,
551 coding_category_max
554 /* Definitions of flag bits used in detect_coding_XXXX. */
555 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
556 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
557 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
558 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
559 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
560 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
561 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
562 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
563 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
564 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
565 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
566 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
567 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
568 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
569 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
570 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
571 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
572 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
573 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
574 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
576 /* This value is returned if detect_coding_mask () find nothing other
577 than ASCII characters. */
578 #define CATEGORY_MASK_ANY \
579 (CATEGORY_MASK_ISO_7 \
580 | CATEGORY_MASK_ISO_7_TIGHT \
581 | CATEGORY_MASK_ISO_8_1 \
582 | CATEGORY_MASK_ISO_8_2 \
583 | CATEGORY_MASK_ISO_7_ELSE \
584 | CATEGORY_MASK_ISO_8_ELSE \
585 | CATEGORY_MASK_UTF_8_AUTO \
586 | CATEGORY_MASK_UTF_8_NOSIG \
587 | CATEGORY_MASK_UTF_8_SIG \
588 | CATEGORY_MASK_UTF_16_AUTO \
589 | CATEGORY_MASK_UTF_16_BE \
590 | CATEGORY_MASK_UTF_16_LE \
591 | CATEGORY_MASK_UTF_16_BE_NOSIG \
592 | CATEGORY_MASK_UTF_16_LE_NOSIG \
593 | CATEGORY_MASK_CHARSET \
594 | CATEGORY_MASK_SJIS \
595 | CATEGORY_MASK_BIG5 \
596 | CATEGORY_MASK_CCL \
597 | CATEGORY_MASK_EMACS_MULE)
600 #define CATEGORY_MASK_ISO_7BIT \
601 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
603 #define CATEGORY_MASK_ISO_8BIT \
604 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
606 #define CATEGORY_MASK_ISO_ELSE \
607 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
609 #define CATEGORY_MASK_ISO_ESCAPE \
610 (CATEGORY_MASK_ISO_7 \
611 | CATEGORY_MASK_ISO_7_TIGHT \
612 | CATEGORY_MASK_ISO_7_ELSE \
613 | CATEGORY_MASK_ISO_8_ELSE)
615 #define CATEGORY_MASK_ISO \
616 ( CATEGORY_MASK_ISO_7BIT \
617 | CATEGORY_MASK_ISO_8BIT \
618 | CATEGORY_MASK_ISO_ELSE)
620 #define CATEGORY_MASK_UTF_16 \
621 (CATEGORY_MASK_UTF_16_AUTO \
622 | CATEGORY_MASK_UTF_16_BE \
623 | CATEGORY_MASK_UTF_16_LE \
624 | CATEGORY_MASK_UTF_16_BE_NOSIG \
625 | CATEGORY_MASK_UTF_16_LE_NOSIG)
627 #define CATEGORY_MASK_UTF_8 \
628 (CATEGORY_MASK_UTF_8_AUTO \
629 | CATEGORY_MASK_UTF_8_NOSIG \
630 | CATEGORY_MASK_UTF_8_SIG)
632 /* Table of coding categories (Lisp symbols). This variable is for
633 internal use only. */
634 static Lisp_Object Vcoding_category_table;
636 /* Table of coding-categories ordered by priority. */
637 static enum coding_category coding_priorities[coding_category_max];
639 /* Nth element is a coding context for the coding system bound to the
640 Nth coding category. */
641 static struct coding_system coding_categories[coding_category_max];
643 /*** Commonly used macros and functions ***/
645 #ifndef min
646 #define min(a, b) ((a) < (b) ? (a) : (b))
647 #endif
648 #ifndef max
649 #define max(a, b) ((a) > (b) ? (a) : (b))
650 #endif
652 /* Encode a flag that can be nil, something else, or t as -1, 0, 1. */
654 static int
655 encode_inhibit_flag (Lisp_Object flag)
657 return NILP (flag) ? -1 : EQ (flag, Qt);
660 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
661 1 means yes, -1 means no, 0 means ask the user variable VAR. */
663 static bool
664 inhibit_flag (int encoded_flag, bool var)
666 return 0 < encoded_flag + var;
669 #define CODING_GET_INFO(coding, attrs, charset_list) \
670 do { \
671 (attrs) = CODING_ID_ATTRS ((coding)->id); \
672 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
673 } while (0)
675 static void
676 CHECK_NATNUM_CAR (Lisp_Object x)
678 Lisp_Object tmp = XCAR (x);
679 CHECK_NATNUM (tmp);
680 XSETCAR (x, tmp);
683 static void
684 CHECK_NATNUM_CDR (Lisp_Object x)
686 Lisp_Object tmp = XCDR (x);
687 CHECK_NATNUM (tmp);
688 XSETCDR (x, tmp);
692 /* Safely get one byte from the source text pointed by SRC which ends
693 at SRC_END, and set C to that byte. If there are not enough bytes
694 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
695 and a multibyte character is found at SRC, set C to the
696 negative value of the character code. The caller should declare
697 and set these variables appropriately in advance:
698 src, src_end, multibytep */
700 #define ONE_MORE_BYTE(c) \
701 do { \
702 if (src == src_end) \
704 if (src_base < src) \
705 record_conversion_result \
706 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
707 goto no_more_source; \
709 c = *src++; \
710 if (multibytep && (c & 0x80)) \
712 if ((c & 0xFE) == 0xC0) \
713 c = ((c & 1) << 6) | *src++; \
714 else \
716 src--; \
717 c = - string_char (src, &src, NULL); \
718 record_conversion_result \
719 (coding, CODING_RESULT_INVALID_SRC); \
722 consumed_chars++; \
723 } while (0)
725 /* Safely get two bytes from the source text pointed by SRC which ends
726 at SRC_END, and set C1 and C2 to those bytes while skipping the
727 heading multibyte characters. If there are not enough bytes in the
728 source, it jumps to 'no_more_source'. If MULTIBYTEP and
729 a multibyte character is found for C2, set C2 to the negative value
730 of the character code. The caller should declare and set these
731 variables appropriately in advance:
732 src, src_end, multibytep
733 It is intended that this macro is used in detect_coding_utf_16. */
735 #define TWO_MORE_BYTES(c1, c2) \
736 do { \
737 do { \
738 if (src == src_end) \
739 goto no_more_source; \
740 c1 = *src++; \
741 if (multibytep && (c1 & 0x80)) \
743 if ((c1 & 0xFE) == 0xC0) \
744 c1 = ((c1 & 1) << 6) | *src++; \
745 else \
747 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
748 c1 = -1; \
751 } while (c1 < 0); \
752 if (src == src_end) \
753 goto no_more_source; \
754 c2 = *src++; \
755 if (multibytep && (c2 & 0x80)) \
757 if ((c2 & 0xFE) == 0xC0) \
758 c2 = ((c2 & 1) << 6) | *src++; \
759 else \
760 c2 = -1; \
762 } while (0)
765 /* Store a byte C in the place pointed by DST and increment DST to the
766 next free point, and increment PRODUCED_CHARS. The caller should
767 assure that C is 0..127, and declare and set the variable `dst'
768 appropriately in advance.
772 #define EMIT_ONE_ASCII_BYTE(c) \
773 do { \
774 produced_chars++; \
775 *dst++ = (c); \
776 } while (0)
779 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
781 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
782 do { \
783 produced_chars += 2; \
784 *dst++ = (c1), *dst++ = (c2); \
785 } while (0)
788 /* Store a byte C in the place pointed by DST and increment DST to the
789 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
790 store in an appropriate multibyte form. The caller should
791 declare and set the variables `dst' and `multibytep' appropriately
792 in advance. */
794 #define EMIT_ONE_BYTE(c) \
795 do { \
796 produced_chars++; \
797 if (multibytep) \
799 unsigned ch = (c); \
800 if (ch >= 0x80) \
801 ch = BYTE8_TO_CHAR (ch); \
802 CHAR_STRING_ADVANCE (ch, dst); \
804 else \
805 *dst++ = (c); \
806 } while (0)
809 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
811 #define EMIT_TWO_BYTES(c1, c2) \
812 do { \
813 produced_chars += 2; \
814 if (multibytep) \
816 unsigned ch; \
818 ch = (c1); \
819 if (ch >= 0x80) \
820 ch = BYTE8_TO_CHAR (ch); \
821 CHAR_STRING_ADVANCE (ch, dst); \
822 ch = (c2); \
823 if (ch >= 0x80) \
824 ch = BYTE8_TO_CHAR (ch); \
825 CHAR_STRING_ADVANCE (ch, dst); \
827 else \
829 *dst++ = (c1); \
830 *dst++ = (c2); \
832 } while (0)
835 #define EMIT_THREE_BYTES(c1, c2, c3) \
836 do { \
837 EMIT_ONE_BYTE (c1); \
838 EMIT_TWO_BYTES (c2, c3); \
839 } while (0)
842 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
843 do { \
844 EMIT_TWO_BYTES (c1, c2); \
845 EMIT_TWO_BYTES (c3, c4); \
846 } while (0)
849 static void
850 record_conversion_result (struct coding_system *coding,
851 enum coding_result_code result)
853 coding->result = result;
854 switch (result)
856 case CODING_RESULT_INSUFFICIENT_SRC:
857 Vlast_code_conversion_error = Qinsufficient_source;
858 break;
859 case CODING_RESULT_INVALID_SRC:
860 Vlast_code_conversion_error = Qinvalid_source;
861 break;
862 case CODING_RESULT_INTERRUPT:
863 Vlast_code_conversion_error = Qinterrupted;
864 break;
865 case CODING_RESULT_INSUFFICIENT_DST:
866 /* Don't record this error in Vlast_code_conversion_error
867 because it happens just temporarily and is resolved when the
868 whole conversion is finished. */
869 break;
870 case CODING_RESULT_SUCCESS:
871 break;
872 default:
873 Vlast_code_conversion_error = intern ("Unknown error");
877 /* These wrapper macros are used to preserve validity of pointers into
878 buffer text across calls to decode_char, encode_char, etc, which
879 could cause relocation of buffers if it loads a charset map,
880 because loading a charset map allocates large structures. */
882 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
883 do { \
884 ptrdiff_t offset; \
886 charset_map_loaded = 0; \
887 c = DECODE_CHAR (charset, code); \
888 if (charset_map_loaded \
889 && (offset = coding_change_source (coding))) \
891 src += offset; \
892 src_base += offset; \
893 src_end += offset; \
895 } while (0)
897 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
898 do { \
899 ptrdiff_t offset; \
901 charset_map_loaded = 0; \
902 code = ENCODE_CHAR (charset, c); \
903 if (charset_map_loaded \
904 && (offset = coding_change_destination (coding))) \
906 dst += offset; \
907 dst_end += offset; \
909 } while (0)
911 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
912 do { \
913 ptrdiff_t offset; \
915 charset_map_loaded = 0; \
916 charset = char_charset (c, charset_list, code_return); \
917 if (charset_map_loaded \
918 && (offset = coding_change_destination (coding))) \
920 dst += offset; \
921 dst_end += offset; \
923 } while (0)
925 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
926 do { \
927 ptrdiff_t offset; \
929 charset_map_loaded = 0; \
930 result = CHAR_CHARSET_P (c, charset); \
931 if (charset_map_loaded \
932 && (offset = coding_change_destination (coding))) \
934 dst += offset; \
935 dst_end += offset; \
937 } while (0)
940 /* If there are at least BYTES length of room at dst, allocate memory
941 for coding->destination and update dst and dst_end. We don't have
942 to take care of coding->source which will be relocated. It is
943 handled by calling coding_set_source in encode_coding. */
945 #define ASSURE_DESTINATION(bytes) \
946 do { \
947 if (dst + (bytes) >= dst_end) \
949 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
951 dst = alloc_destination (coding, more_bytes, dst); \
952 dst_end = coding->destination + coding->dst_bytes; \
954 } while (0)
957 /* Store multibyte form of the character C in P, and advance P to the
958 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
959 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
960 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
962 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
964 /* Return the character code of character whose multibyte form is at
965 P, and advance P to the end of the multibyte form. This used to be
966 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
967 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
969 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
971 /* Set coding->source from coding->src_object. */
973 static void
974 coding_set_source (struct coding_system *coding)
976 if (BUFFERP (coding->src_object))
978 struct buffer *buf = XBUFFER (coding->src_object);
980 if (coding->src_pos < 0)
981 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
982 else
983 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
985 else if (STRINGP (coding->src_object))
987 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
989 else
991 /* Otherwise, the source is C string and is never relocated
992 automatically. Thus we don't have to update anything. */
997 /* Set coding->source from coding->src_object, and return how many
998 bytes coding->source was changed. */
1000 static ptrdiff_t
1001 coding_change_source (struct coding_system *coding)
1003 const unsigned char *orig = coding->source;
1004 coding_set_source (coding);
1005 return coding->source - orig;
1009 /* Set coding->destination from coding->dst_object. */
1011 static void
1012 coding_set_destination (struct coding_system *coding)
1014 if (BUFFERP (coding->dst_object))
1016 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1018 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1019 coding->dst_bytes = (GAP_END_ADDR
1020 - (coding->src_bytes - coding->consumed)
1021 - coding->destination);
1023 else
1025 /* We are sure that coding->dst_pos_byte is before the gap
1026 of the buffer. */
1027 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1028 + coding->dst_pos_byte - BEG_BYTE);
1029 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1030 - coding->destination);
1033 else
1035 /* Otherwise, the destination is C string and is never relocated
1036 automatically. Thus we don't have to update anything. */
1041 /* Set coding->destination from coding->dst_object, and return how
1042 many bytes coding->destination was changed. */
1044 static ptrdiff_t
1045 coding_change_destination (struct coding_system *coding)
1047 const unsigned char *orig = coding->destination;
1048 coding_set_destination (coding);
1049 return coding->destination - orig;
1053 static void
1054 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1056 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1057 string_overflow ();
1058 coding->destination = xrealloc (coding->destination,
1059 coding->dst_bytes + bytes);
1060 coding->dst_bytes += bytes;
1063 static void
1064 coding_alloc_by_making_gap (struct coding_system *coding,
1065 ptrdiff_t gap_head_used, ptrdiff_t bytes)
1067 if (EQ (coding->src_object, coding->dst_object))
1069 /* The gap may contain the produced data at the head and not-yet
1070 consumed data at the tail. To preserve those data, we at
1071 first make the gap size to zero, then increase the gap
1072 size. */
1073 ptrdiff_t add = GAP_SIZE;
1075 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1076 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1077 make_gap (bytes);
1078 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1079 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1081 else
1082 make_gap_1 (XBUFFER (coding->dst_object), bytes);
1086 static unsigned char *
1087 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1088 unsigned char *dst)
1090 ptrdiff_t offset = dst - coding->destination;
1092 if (BUFFERP (coding->dst_object))
1094 struct buffer *buf = XBUFFER (coding->dst_object);
1096 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1098 else
1099 coding_alloc_by_realloc (coding, nbytes);
1100 coding_set_destination (coding);
1101 dst = coding->destination + offset;
1102 return dst;
1105 /** Macros for annotations. */
1107 /* An annotation data is stored in the array coding->charbuf in this
1108 format:
1109 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1110 LENGTH is the number of elements in the annotation.
1111 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1112 NCHARS is the number of characters in the text annotated.
1114 The format of the following elements depend on ANNOTATION_MASK.
1116 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1117 follows:
1118 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1120 NBYTES is the number of bytes specified in the header part of
1121 old-style emacs-mule encoding, or 0 for the other kind of
1122 composition.
1124 METHOD is one of enum composition_method.
1126 Optional COMPOSITION-COMPONENTS are characters and composition
1127 rules.
1129 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1130 follows.
1132 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1133 recover from an invalid annotation, and should be skipped by
1134 produce_annotation. */
1136 /* Maximum length of the header of annotation data. */
1137 #define MAX_ANNOTATION_LENGTH 5
1139 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1140 do { \
1141 *(buf)++ = -(len); \
1142 *(buf)++ = (mask); \
1143 *(buf)++ = (nchars); \
1144 coding->annotated = 1; \
1145 } while (0);
1147 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1148 do { \
1149 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1150 *buf++ = nbytes; \
1151 *buf++ = method; \
1152 } while (0)
1155 #define ADD_CHARSET_DATA(buf, nchars, id) \
1156 do { \
1157 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1158 *buf++ = id; \
1159 } while (0)
1162 /* Bitmasks for coding->eol_seen. */
1164 #define EOL_SEEN_NONE 0
1165 #define EOL_SEEN_LF 1
1166 #define EOL_SEEN_CR 2
1167 #define EOL_SEEN_CRLF 4
1170 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1175 /*** 3. UTF-8 ***/
1177 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1178 Return true if a text is encoded in UTF-8. */
1180 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1181 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1182 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1183 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1184 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1185 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1187 #define UTF_8_BOM_1 0xEF
1188 #define UTF_8_BOM_2 0xBB
1189 #define UTF_8_BOM_3 0xBF
1191 /* Unlike the other detect_coding_XXX, this function counts number of
1192 characters and check EOL format. */
1194 static bool
1195 detect_coding_utf_8 (struct coding_system *coding,
1196 struct coding_detection_info *detect_info)
1198 const unsigned char *src = coding->source, *src_base;
1199 const unsigned char *src_end = coding->source + coding->src_bytes;
1200 bool multibytep = coding->src_multibyte;
1201 ptrdiff_t consumed_chars = 0;
1202 bool bom_found = 0;
1203 int nchars = coding->head_ascii;
1204 int eol_seen = coding->eol_seen;
1206 detect_info->checked |= CATEGORY_MASK_UTF_8;
1207 /* A coding system of this category is always ASCII compatible. */
1208 src += nchars;
1210 if (src == coding->source /* BOM should be at the head. */
1211 && src + 3 < src_end /* BOM is 3-byte long. */
1212 && src[0] == UTF_8_BOM_1
1213 && src[1] == UTF_8_BOM_2
1214 && src[2] == UTF_8_BOM_3)
1216 bom_found = 1;
1217 src += 3;
1218 nchars++;
1221 while (1)
1223 int c, c1, c2, c3, c4;
1225 src_base = src;
1226 ONE_MORE_BYTE (c);
1227 if (c < 0 || UTF_8_1_OCTET_P (c))
1229 nchars++;
1230 if (c == '\r')
1232 if (src < src_end && *src == '\n')
1234 eol_seen |= EOL_SEEN_CRLF;
1235 src++;
1236 nchars++;
1238 else
1239 eol_seen |= EOL_SEEN_CR;
1241 else if (c == '\n')
1242 eol_seen |= EOL_SEEN_LF;
1243 continue;
1245 ONE_MORE_BYTE (c1);
1246 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1247 break;
1248 if (UTF_8_2_OCTET_LEADING_P (c))
1250 nchars++;
1251 continue;
1253 ONE_MORE_BYTE (c2);
1254 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1255 break;
1256 if (UTF_8_3_OCTET_LEADING_P (c))
1258 nchars++;
1259 continue;
1261 ONE_MORE_BYTE (c3);
1262 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1263 break;
1264 if (UTF_8_4_OCTET_LEADING_P (c))
1266 nchars++;
1267 continue;
1269 ONE_MORE_BYTE (c4);
1270 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1271 break;
1272 if (UTF_8_5_OCTET_LEADING_P (c))
1274 nchars++;
1275 continue;
1277 break;
1279 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1280 return 0;
1282 no_more_source:
1283 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1285 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1286 return 0;
1288 if (bom_found)
1290 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1291 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1293 else
1295 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1296 if (nchars < src_end - coding->source)
1297 /* The found characters are less than source bytes, which
1298 means that we found a valid non-ASCII characters. */
1299 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1301 coding->detected_utf8_chars = nchars;
1302 return 1;
1306 static void
1307 decode_coding_utf_8 (struct coding_system *coding)
1309 const unsigned char *src = coding->source + coding->consumed;
1310 const unsigned char *src_end = coding->source + coding->src_bytes;
1311 const unsigned char *src_base;
1312 int *charbuf = coding->charbuf + coding->charbuf_used;
1313 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1314 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1315 bool multibytep = coding->src_multibyte;
1316 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1317 bool eol_dos
1318 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1319 int byte_after_cr = -1;
1321 if (bom != utf_without_bom)
1323 int c1, c2, c3;
1325 src_base = src;
1326 ONE_MORE_BYTE (c1);
1327 if (! UTF_8_3_OCTET_LEADING_P (c1))
1328 src = src_base;
1329 else
1331 ONE_MORE_BYTE (c2);
1332 if (! UTF_8_EXTRA_OCTET_P (c2))
1333 src = src_base;
1334 else
1336 ONE_MORE_BYTE (c3);
1337 if (! UTF_8_EXTRA_OCTET_P (c3))
1338 src = src_base;
1339 else
1341 if ((c1 != UTF_8_BOM_1)
1342 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1343 src = src_base;
1344 else
1345 CODING_UTF_8_BOM (coding) = utf_without_bom;
1350 CODING_UTF_8_BOM (coding) = utf_without_bom;
1352 while (1)
1354 int c, c1, c2, c3, c4, c5;
1356 src_base = src;
1357 consumed_chars_base = consumed_chars;
1359 if (charbuf >= charbuf_end)
1361 if (byte_after_cr >= 0)
1362 src_base--;
1363 break;
1366 if (byte_after_cr >= 0)
1367 c1 = byte_after_cr, byte_after_cr = -1;
1368 else
1369 ONE_MORE_BYTE (c1);
1370 if (c1 < 0)
1372 c = - c1;
1374 else if (UTF_8_1_OCTET_P (c1))
1376 if (eol_dos && c1 == '\r')
1377 ONE_MORE_BYTE (byte_after_cr);
1378 c = c1;
1380 else
1382 ONE_MORE_BYTE (c2);
1383 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1384 goto invalid_code;
1385 if (UTF_8_2_OCTET_LEADING_P (c1))
1387 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1388 /* Reject overlong sequences here and below. Encoders
1389 producing them are incorrect, they can be misleading,
1390 and they mess up read/write invariance. */
1391 if (c < 128)
1392 goto invalid_code;
1394 else
1396 ONE_MORE_BYTE (c3);
1397 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1398 goto invalid_code;
1399 if (UTF_8_3_OCTET_LEADING_P (c1))
1401 c = (((c1 & 0xF) << 12)
1402 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1403 if (c < 0x800
1404 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1405 goto invalid_code;
1407 else
1409 ONE_MORE_BYTE (c4);
1410 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1411 goto invalid_code;
1412 if (UTF_8_4_OCTET_LEADING_P (c1))
1414 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1415 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1416 if (c < 0x10000)
1417 goto invalid_code;
1419 else
1421 ONE_MORE_BYTE (c5);
1422 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1423 goto invalid_code;
1424 if (UTF_8_5_OCTET_LEADING_P (c1))
1426 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1427 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1428 | (c5 & 0x3F));
1429 if ((c > MAX_CHAR) || (c < 0x200000))
1430 goto invalid_code;
1432 else
1433 goto invalid_code;
1439 *charbuf++ = c;
1440 continue;
1442 invalid_code:
1443 src = src_base;
1444 consumed_chars = consumed_chars_base;
1445 ONE_MORE_BYTE (c);
1446 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1447 coding->errors++;
1450 no_more_source:
1451 coding->consumed_char += consumed_chars_base;
1452 coding->consumed = src_base - coding->source;
1453 coding->charbuf_used = charbuf - coding->charbuf;
1457 static bool
1458 encode_coding_utf_8 (struct coding_system *coding)
1460 bool multibytep = coding->dst_multibyte;
1461 int *charbuf = coding->charbuf;
1462 int *charbuf_end = charbuf + coding->charbuf_used;
1463 unsigned char *dst = coding->destination + coding->produced;
1464 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1465 ptrdiff_t produced_chars = 0;
1466 int c;
1468 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1470 ASSURE_DESTINATION (3);
1471 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1472 CODING_UTF_8_BOM (coding) = utf_without_bom;
1475 if (multibytep)
1477 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1479 while (charbuf < charbuf_end)
1481 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1483 ASSURE_DESTINATION (safe_room);
1484 c = *charbuf++;
1485 if (CHAR_BYTE8_P (c))
1487 c = CHAR_TO_BYTE8 (c);
1488 EMIT_ONE_BYTE (c);
1490 else
1492 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1493 for (p = str; p < pend; p++)
1494 EMIT_ONE_BYTE (*p);
1498 else
1500 int safe_room = MAX_MULTIBYTE_LENGTH;
1502 while (charbuf < charbuf_end)
1504 ASSURE_DESTINATION (safe_room);
1505 c = *charbuf++;
1506 if (CHAR_BYTE8_P (c))
1507 *dst++ = CHAR_TO_BYTE8 (c);
1508 else
1509 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1510 produced_chars++;
1513 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1514 coding->produced_char += produced_chars;
1515 coding->produced = dst - coding->destination;
1516 return 0;
1520 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1521 Return true if a text is encoded in one of UTF-16 based coding systems. */
1523 #define UTF_16_HIGH_SURROGATE_P(val) \
1524 (((val) & 0xFC00) == 0xD800)
1526 #define UTF_16_LOW_SURROGATE_P(val) \
1527 (((val) & 0xFC00) == 0xDC00)
1530 static bool
1531 detect_coding_utf_16 (struct coding_system *coding,
1532 struct coding_detection_info *detect_info)
1534 const unsigned char *src = coding->source;
1535 const unsigned char *src_end = coding->source + coding->src_bytes;
1536 bool multibytep = coding->src_multibyte;
1537 int c1, c2;
1539 detect_info->checked |= CATEGORY_MASK_UTF_16;
1540 if (coding->mode & CODING_MODE_LAST_BLOCK
1541 && (coding->src_chars & 1))
1543 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1544 return 0;
1547 TWO_MORE_BYTES (c1, c2);
1548 if ((c1 == 0xFF) && (c2 == 0xFE))
1550 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1551 | CATEGORY_MASK_UTF_16_AUTO);
1552 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1553 | CATEGORY_MASK_UTF_16_BE_NOSIG
1554 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1556 else if ((c1 == 0xFE) && (c2 == 0xFF))
1558 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1559 | CATEGORY_MASK_UTF_16_AUTO);
1560 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1561 | CATEGORY_MASK_UTF_16_BE_NOSIG
1562 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1564 else if (c2 < 0)
1566 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1567 return 0;
1569 else
1571 /* We check the dispersion of Eth and Oth bytes where E is even and
1572 O is odd. If both are high, we assume binary data.*/
1573 unsigned char e[256], o[256];
1574 unsigned e_num = 1, o_num = 1;
1576 memset (e, 0, 256);
1577 memset (o, 0, 256);
1578 e[c1] = 1;
1579 o[c2] = 1;
1581 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1582 |CATEGORY_MASK_UTF_16_BE
1583 | CATEGORY_MASK_UTF_16_LE);
1585 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1586 != CATEGORY_MASK_UTF_16)
1588 TWO_MORE_BYTES (c1, c2);
1589 if (c2 < 0)
1590 break;
1591 if (! e[c1])
1593 e[c1] = 1;
1594 e_num++;
1595 if (e_num >= 128)
1596 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1598 if (! o[c2])
1600 o[c2] = 1;
1601 o_num++;
1602 if (o_num >= 128)
1603 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1606 return 0;
1609 no_more_source:
1610 return 1;
1613 static void
1614 decode_coding_utf_16 (struct coding_system *coding)
1616 const unsigned char *src = coding->source + coding->consumed;
1617 const unsigned char *src_end = coding->source + coding->src_bytes;
1618 const unsigned char *src_base;
1619 int *charbuf = coding->charbuf + coding->charbuf_used;
1620 /* We may produces at most 3 chars in one loop. */
1621 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1622 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1623 bool multibytep = coding->src_multibyte;
1624 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1625 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1626 int surrogate = CODING_UTF_16_SURROGATE (coding);
1627 bool eol_dos
1628 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1629 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1631 if (bom == utf_with_bom)
1633 int c, c1, c2;
1635 src_base = src;
1636 ONE_MORE_BYTE (c1);
1637 ONE_MORE_BYTE (c2);
1638 c = (c1 << 8) | c2;
1640 if (endian == utf_16_big_endian
1641 ? c != 0xFEFF : c != 0xFFFE)
1643 /* The first two bytes are not BOM. Treat them as bytes
1644 for a normal character. */
1645 src = src_base;
1646 coding->errors++;
1648 CODING_UTF_16_BOM (coding) = utf_without_bom;
1650 else if (bom == utf_detect_bom)
1652 /* We have already tried to detect BOM and failed in
1653 detect_coding. */
1654 CODING_UTF_16_BOM (coding) = utf_without_bom;
1657 while (1)
1659 int c, c1, c2;
1661 src_base = src;
1662 consumed_chars_base = consumed_chars;
1664 if (charbuf >= charbuf_end)
1666 if (byte_after_cr1 >= 0)
1667 src_base -= 2;
1668 break;
1671 if (byte_after_cr1 >= 0)
1672 c1 = byte_after_cr1, byte_after_cr1 = -1;
1673 else
1674 ONE_MORE_BYTE (c1);
1675 if (c1 < 0)
1677 *charbuf++ = -c1;
1678 continue;
1680 if (byte_after_cr2 >= 0)
1681 c2 = byte_after_cr2, byte_after_cr2 = -1;
1682 else
1683 ONE_MORE_BYTE (c2);
1684 if (c2 < 0)
1686 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1687 *charbuf++ = -c2;
1688 continue;
1690 c = (endian == utf_16_big_endian
1691 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1693 if (surrogate)
1695 if (! UTF_16_LOW_SURROGATE_P (c))
1697 if (endian == utf_16_big_endian)
1698 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1699 else
1700 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1701 *charbuf++ = c1;
1702 *charbuf++ = c2;
1703 coding->errors++;
1704 if (UTF_16_HIGH_SURROGATE_P (c))
1705 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1706 else
1707 *charbuf++ = c;
1709 else
1711 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1712 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1713 *charbuf++ = 0x10000 + c;
1716 else
1718 if (UTF_16_HIGH_SURROGATE_P (c))
1719 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1720 else
1722 if (eol_dos && c == '\r')
1724 ONE_MORE_BYTE (byte_after_cr1);
1725 ONE_MORE_BYTE (byte_after_cr2);
1727 *charbuf++ = c;
1732 no_more_source:
1733 coding->consumed_char += consumed_chars_base;
1734 coding->consumed = src_base - coding->source;
1735 coding->charbuf_used = charbuf - coding->charbuf;
1738 static bool
1739 encode_coding_utf_16 (struct coding_system *coding)
1741 bool multibytep = coding->dst_multibyte;
1742 int *charbuf = coding->charbuf;
1743 int *charbuf_end = charbuf + coding->charbuf_used;
1744 unsigned char *dst = coding->destination + coding->produced;
1745 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1746 int safe_room = 8;
1747 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1748 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1749 ptrdiff_t produced_chars = 0;
1750 int c;
1752 if (bom != utf_without_bom)
1754 ASSURE_DESTINATION (safe_room);
1755 if (big_endian)
1756 EMIT_TWO_BYTES (0xFE, 0xFF);
1757 else
1758 EMIT_TWO_BYTES (0xFF, 0xFE);
1759 CODING_UTF_16_BOM (coding) = utf_without_bom;
1762 while (charbuf < charbuf_end)
1764 ASSURE_DESTINATION (safe_room);
1765 c = *charbuf++;
1766 if (c > MAX_UNICODE_CHAR)
1767 c = coding->default_char;
1769 if (c < 0x10000)
1771 if (big_endian)
1772 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1773 else
1774 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1776 else
1778 int c1, c2;
1780 c -= 0x10000;
1781 c1 = (c >> 10) + 0xD800;
1782 c2 = (c & 0x3FF) + 0xDC00;
1783 if (big_endian)
1784 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1785 else
1786 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1789 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1790 coding->produced = dst - coding->destination;
1791 coding->produced_char += produced_chars;
1792 return 0;
1796 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1798 /* Emacs' internal format for representation of multiple character
1799 sets is a kind of multi-byte encoding, i.e. characters are
1800 represented by variable-length sequences of one-byte codes.
1802 ASCII characters and control characters (e.g. `tab', `newline') are
1803 represented by one-byte sequences which are their ASCII codes, in
1804 the range 0x00 through 0x7F.
1806 8-bit characters of the range 0x80..0x9F are represented by
1807 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1808 code + 0x20).
1810 8-bit characters of the range 0xA0..0xFF are represented by
1811 one-byte sequences which are their 8-bit code.
1813 The other characters are represented by a sequence of `base
1814 leading-code', optional `extended leading-code', and one or two
1815 `position-code's. The length of the sequence is determined by the
1816 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1817 whereas extended leading-code and position-code take the range 0xA0
1818 through 0xFF. See `charset.h' for more details about leading-code
1819 and position-code.
1821 --- CODE RANGE of Emacs' internal format ---
1822 character set range
1823 ------------- -----
1824 ascii 0x00..0x7F
1825 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1826 eight-bit-graphic 0xA0..0xBF
1827 ELSE 0x81..0x9D + [0xA0..0xFF]+
1828 ---------------------------------------------
1830 As this is the internal character representation, the format is
1831 usually not used externally (i.e. in a file or in a data sent to a
1832 process). But, it is possible to have a text externally in this
1833 format (i.e. by encoding by the coding system `emacs-mule').
1835 In that case, a sequence of one-byte codes has a slightly different
1836 form.
1838 At first, all characters in eight-bit-control are represented by
1839 one-byte sequences which are their 8-bit code.
1841 Next, character composition data are represented by the byte
1842 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1843 where,
1844 METHOD is 0xF2 plus one of composition method (enum
1845 composition_method),
1847 BYTES is 0xA0 plus a byte length of this composition data,
1849 CHARS is 0xA0 plus a number of characters composed by this
1850 data,
1852 COMPONENTs are characters of multibyte form or composition
1853 rules encoded by two-byte of ASCII codes.
1855 In addition, for backward compatibility, the following formats are
1856 also recognized as composition data on decoding.
1858 0x80 MSEQ ...
1859 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1861 Here,
1862 MSEQ is a multibyte form but in these special format:
1863 ASCII: 0xA0 ASCII_CODE+0x80,
1864 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1865 RULE is a one byte code of the range 0xA0..0xF0 that
1866 represents a composition rule.
1869 char emacs_mule_bytes[256];
1872 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1873 Return true if a text is encoded in 'emacs-mule'. */
1875 static bool
1876 detect_coding_emacs_mule (struct coding_system *coding,
1877 struct coding_detection_info *detect_info)
1879 const unsigned char *src = coding->source, *src_base;
1880 const unsigned char *src_end = coding->source + coding->src_bytes;
1881 bool multibytep = coding->src_multibyte;
1882 ptrdiff_t consumed_chars = 0;
1883 int c;
1884 int found = 0;
1886 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1887 /* A coding system of this category is always ASCII compatible. */
1888 src += coding->head_ascii;
1890 while (1)
1892 src_base = src;
1893 ONE_MORE_BYTE (c);
1894 if (c < 0)
1895 continue;
1896 if (c == 0x80)
1898 /* Perhaps the start of composite character. We simply skip
1899 it because analyzing it is too heavy for detecting. But,
1900 at least, we check that the composite character
1901 constitutes of more than 4 bytes. */
1902 const unsigned char *src_start;
1904 repeat:
1905 src_start = src;
1908 ONE_MORE_BYTE (c);
1910 while (c >= 0xA0);
1912 if (src - src_start <= 4)
1913 break;
1914 found = CATEGORY_MASK_EMACS_MULE;
1915 if (c == 0x80)
1916 goto repeat;
1919 if (c < 0x80)
1921 if (c < 0x20
1922 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1923 break;
1925 else
1927 int more_bytes = emacs_mule_bytes[c] - 1;
1929 while (more_bytes > 0)
1931 ONE_MORE_BYTE (c);
1932 if (c < 0xA0)
1934 src--; /* Unread the last byte. */
1935 break;
1937 more_bytes--;
1939 if (more_bytes != 0)
1940 break;
1941 found = CATEGORY_MASK_EMACS_MULE;
1944 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1945 return 0;
1947 no_more_source:
1948 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1950 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1951 return 0;
1953 detect_info->found |= found;
1954 return 1;
1958 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1959 character. If CMP_STATUS indicates that we must expect MSEQ or
1960 RULE described above, decode it and return the negative value of
1961 the decoded character or rule. If an invalid byte is found, return
1962 -1. If SRC is too short, return -2. */
1964 static int
1965 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1966 int *nbytes, int *nchars, int *id,
1967 struct composition_status *cmp_status)
1969 const unsigned char *src_end = coding->source + coding->src_bytes;
1970 const unsigned char *src_base = src;
1971 bool multibytep = coding->src_multibyte;
1972 int charset_ID;
1973 unsigned code;
1974 int c;
1975 int consumed_chars = 0;
1976 bool mseq_found = 0;
1978 ONE_MORE_BYTE (c);
1979 if (c < 0)
1981 c = -c;
1982 charset_ID = emacs_mule_charset[0];
1984 else
1986 if (c >= 0xA0)
1988 if (cmp_status->state != COMPOSING_NO
1989 && cmp_status->old_form)
1991 if (cmp_status->state == COMPOSING_CHAR)
1993 if (c == 0xA0)
1995 ONE_MORE_BYTE (c);
1996 c -= 0x80;
1997 if (c < 0)
1998 goto invalid_code;
2000 else
2001 c -= 0x20;
2002 mseq_found = 1;
2004 else
2006 *nbytes = src - src_base;
2007 *nchars = consumed_chars;
2008 return -c;
2011 else
2012 goto invalid_code;
2015 switch (emacs_mule_bytes[c])
2017 case 2:
2018 if ((charset_ID = emacs_mule_charset[c]) < 0)
2019 goto invalid_code;
2020 ONE_MORE_BYTE (c);
2021 if (c < 0xA0)
2022 goto invalid_code;
2023 code = c & 0x7F;
2024 break;
2026 case 3:
2027 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2028 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2030 ONE_MORE_BYTE (c);
2031 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2032 goto invalid_code;
2033 ONE_MORE_BYTE (c);
2034 if (c < 0xA0)
2035 goto invalid_code;
2036 code = c & 0x7F;
2038 else
2040 if ((charset_ID = emacs_mule_charset[c]) < 0)
2041 goto invalid_code;
2042 ONE_MORE_BYTE (c);
2043 if (c < 0xA0)
2044 goto invalid_code;
2045 code = (c & 0x7F) << 8;
2046 ONE_MORE_BYTE (c);
2047 if (c < 0xA0)
2048 goto invalid_code;
2049 code |= c & 0x7F;
2051 break;
2053 case 4:
2054 ONE_MORE_BYTE (c);
2055 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2056 goto invalid_code;
2057 ONE_MORE_BYTE (c);
2058 if (c < 0xA0)
2059 goto invalid_code;
2060 code = (c & 0x7F) << 8;
2061 ONE_MORE_BYTE (c);
2062 if (c < 0xA0)
2063 goto invalid_code;
2064 code |= c & 0x7F;
2065 break;
2067 case 1:
2068 code = c;
2069 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2070 break;
2072 default:
2073 emacs_abort ();
2075 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2076 CHARSET_FROM_ID (charset_ID), code, c);
2077 if (c < 0)
2078 goto invalid_code;
2080 *nbytes = src - src_base;
2081 *nchars = consumed_chars;
2082 if (id)
2083 *id = charset_ID;
2084 return (mseq_found ? -c : c);
2086 no_more_source:
2087 return -2;
2089 invalid_code:
2090 return -1;
2094 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2096 /* Handle these composition sequence ('|': the end of header elements,
2097 BYTES and CHARS >= 0xA0):
2099 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2100 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2101 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2103 and these old form:
2105 (4) relative composition: 0x80 | MSEQ ... MSEQ
2106 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2108 When the starter 0x80 and the following header elements are found,
2109 this annotation header is produced.
2111 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2113 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2114 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2116 Then, upon reading the following elements, these codes are produced
2117 until the composition end is found:
2119 (1) CHAR ... CHAR
2120 (2) ALT ... ALT CHAR ... CHAR
2121 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2122 (4) CHAR ... CHAR
2123 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2125 When the composition end is found, LENGTH and NCHARS in the
2126 annotation header is updated as below:
2128 (1) LENGTH: unchanged, NCHARS: unchanged
2129 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2130 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2131 (4) LENGTH: unchanged, NCHARS: number of CHARs
2132 (5) LENGTH: unchanged, NCHARS: number of CHARs
2134 If an error is found while composing, the annotation header is
2135 changed to the original composition header (plus filler -1s) as
2136 below:
2138 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2139 (5) [ 0x80 0xFF -1 -1- -1 ]
2141 and the sequence [ -2 DECODED-RULE ] is changed to the original
2142 byte sequence as below:
2143 o the original byte sequence is B: [ B -1 ]
2144 o the original byte sequence is B1 B2: [ B1 B2 ]
2146 Most of the routines are implemented by macros because many
2147 variables and labels in the caller decode_coding_emacs_mule must be
2148 accessible, and they are usually called just once (thus doesn't
2149 increase the size of compiled object). */
2151 /* Decode a composition rule represented by C as a component of
2152 composition sequence of Emacs 20 style. Set RULE to the decoded
2153 rule. */
2155 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2156 do { \
2157 int gref, nref; \
2159 c -= 0xA0; \
2160 if (c < 0 || c >= 81) \
2161 goto invalid_code; \
2162 gref = c / 9, nref = c % 9; \
2163 if (gref == 4) gref = 10; \
2164 if (nref == 4) nref = 10; \
2165 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2166 } while (0)
2169 /* Decode a composition rule represented by C and the following byte
2170 at SRC as a component of composition sequence of Emacs 21 style.
2171 Set RULE to the decoded rule. */
2173 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2174 do { \
2175 int gref, nref; \
2177 gref = c - 0x20; \
2178 if (gref < 0 || gref >= 81) \
2179 goto invalid_code; \
2180 ONE_MORE_BYTE (c); \
2181 nref = c - 0x20; \
2182 if (nref < 0 || nref >= 81) \
2183 goto invalid_code; \
2184 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2185 } while (0)
2188 /* Start of Emacs 21 style format. The first three bytes at SRC are
2189 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2190 byte length of this composition information, CHARS is the number of
2191 characters composed by this composition. */
2193 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2194 do { \
2195 enum composition_method method = c - 0xF2; \
2196 int nbytes, nchars; \
2198 ONE_MORE_BYTE (c); \
2199 if (c < 0) \
2200 goto invalid_code; \
2201 nbytes = c - 0xA0; \
2202 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2203 goto invalid_code; \
2204 ONE_MORE_BYTE (c); \
2205 nchars = c - 0xA0; \
2206 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2207 goto invalid_code; \
2208 cmp_status->old_form = 0; \
2209 cmp_status->method = method; \
2210 if (method == COMPOSITION_RELATIVE) \
2211 cmp_status->state = COMPOSING_CHAR; \
2212 else \
2213 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2214 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2215 cmp_status->nchars = nchars; \
2216 cmp_status->ncomps = nbytes - 4; \
2217 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2218 } while (0)
2221 /* Start of Emacs 20 style format for relative composition. */
2223 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2224 do { \
2225 cmp_status->old_form = 1; \
2226 cmp_status->method = COMPOSITION_RELATIVE; \
2227 cmp_status->state = COMPOSING_CHAR; \
2228 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2229 cmp_status->nchars = cmp_status->ncomps = 0; \
2230 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2231 } while (0)
2234 /* Start of Emacs 20 style format for rule-base composition. */
2236 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2237 do { \
2238 cmp_status->old_form = 1; \
2239 cmp_status->method = COMPOSITION_WITH_RULE; \
2240 cmp_status->state = COMPOSING_CHAR; \
2241 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2242 cmp_status->nchars = cmp_status->ncomps = 0; \
2243 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2244 } while (0)
2247 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2248 do { \
2249 const unsigned char *current_src = src; \
2251 ONE_MORE_BYTE (c); \
2252 if (c < 0) \
2253 goto invalid_code; \
2254 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2255 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2256 DECODE_EMACS_MULE_21_COMPOSITION (); \
2257 else if (c < 0xA0) \
2258 goto invalid_code; \
2259 else if (c < 0xC0) \
2261 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2262 /* Re-read C as a composition component. */ \
2263 src = current_src; \
2265 else if (c == 0xFF) \
2266 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2267 else \
2268 goto invalid_code; \
2269 } while (0)
2271 #define EMACS_MULE_COMPOSITION_END() \
2272 do { \
2273 int idx = - cmp_status->length; \
2275 if (cmp_status->old_form) \
2276 charbuf[idx + 2] = cmp_status->nchars; \
2277 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2278 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2279 cmp_status->state = COMPOSING_NO; \
2280 } while (0)
2283 static int
2284 emacs_mule_finish_composition (int *charbuf,
2285 struct composition_status *cmp_status)
2287 int idx = - cmp_status->length;
2288 int new_chars;
2290 if (cmp_status->old_form && cmp_status->nchars > 0)
2292 charbuf[idx + 2] = cmp_status->nchars;
2293 new_chars = 0;
2294 if (cmp_status->method == COMPOSITION_WITH_RULE
2295 && cmp_status->state == COMPOSING_CHAR)
2297 /* The last rule was invalid. */
2298 int rule = charbuf[-1] + 0xA0;
2300 charbuf[-2] = BYTE8_TO_CHAR (rule);
2301 charbuf[-1] = -1;
2302 new_chars = 1;
2305 else
2307 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2309 if (cmp_status->method == COMPOSITION_WITH_RULE)
2311 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2312 charbuf[idx++] = -3;
2313 charbuf[idx++] = 0;
2314 new_chars = 1;
2316 else
2318 int nchars = charbuf[idx + 1] + 0xA0;
2319 int nbytes = charbuf[idx + 2] + 0xA0;
2321 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2322 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2323 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2324 charbuf[idx++] = -1;
2325 new_chars = 4;
2328 cmp_status->state = COMPOSING_NO;
2329 return new_chars;
2332 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2333 do { \
2334 if (cmp_status->state != COMPOSING_NO) \
2335 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2336 } while (0)
2339 static void
2340 decode_coding_emacs_mule (struct coding_system *coding)
2342 const unsigned char *src = coding->source + coding->consumed;
2343 const unsigned char *src_end = coding->source + coding->src_bytes;
2344 const unsigned char *src_base;
2345 int *charbuf = coding->charbuf + coding->charbuf_used;
2346 /* We may produce two annotations (charset and composition) in one
2347 loop and one more charset annotation at the end. */
2348 int *charbuf_end
2349 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2350 /* We can produce up to 2 characters in a loop. */
2351 - 1;
2352 ptrdiff_t consumed_chars = 0, consumed_chars_base;
2353 bool multibytep = coding->src_multibyte;
2354 ptrdiff_t char_offset = coding->produced_char;
2355 ptrdiff_t last_offset = char_offset;
2356 int last_id = charset_ascii;
2357 bool eol_dos
2358 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2359 int byte_after_cr = -1;
2360 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2362 if (cmp_status->state != COMPOSING_NO)
2364 int i;
2366 if (charbuf_end - charbuf < cmp_status->length)
2367 emacs_abort ();
2368 for (i = 0; i < cmp_status->length; i++)
2369 *charbuf++ = cmp_status->carryover[i];
2370 coding->annotated = 1;
2373 while (1)
2375 int c, id IF_LINT (= 0);
2377 src_base = src;
2378 consumed_chars_base = consumed_chars;
2380 if (charbuf >= charbuf_end)
2382 if (byte_after_cr >= 0)
2383 src_base--;
2384 break;
2387 if (byte_after_cr >= 0)
2388 c = byte_after_cr, byte_after_cr = -1;
2389 else
2390 ONE_MORE_BYTE (c);
2392 if (c < 0 || c == 0x80)
2394 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2395 if (c < 0)
2397 *charbuf++ = -c;
2398 char_offset++;
2400 else
2401 DECODE_EMACS_MULE_COMPOSITION_START ();
2402 continue;
2405 if (c < 0x80)
2407 if (eol_dos && c == '\r')
2408 ONE_MORE_BYTE (byte_after_cr);
2409 id = charset_ascii;
2410 if (cmp_status->state != COMPOSING_NO)
2412 if (cmp_status->old_form)
2413 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2414 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2415 cmp_status->ncomps--;
2418 else
2420 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2421 /* emacs_mule_char can load a charset map from a file, which
2422 allocates a large structure and might cause buffer text
2423 to be relocated as result. Thus, we need to remember the
2424 original pointer to buffer text, and fix up all related
2425 pointers after the call. */
2426 const unsigned char *orig = coding->source;
2427 ptrdiff_t offset;
2429 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2430 cmp_status);
2431 offset = coding->source - orig;
2432 if (offset)
2434 src += offset;
2435 src_base += offset;
2436 src_end += offset;
2438 if (c < 0)
2440 if (c == -1)
2441 goto invalid_code;
2442 if (c == -2)
2443 break;
2445 src = src_base + nbytes;
2446 consumed_chars = consumed_chars_base + nchars;
2447 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2448 cmp_status->ncomps -= nchars;
2451 /* Now if C >= 0, we found a normally encoded character, if C <
2452 0, we found an old-style composition component character or
2453 rule. */
2455 if (cmp_status->state == COMPOSING_NO)
2457 if (last_id != id)
2459 if (last_id != charset_ascii)
2460 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2461 last_id);
2462 last_id = id;
2463 last_offset = char_offset;
2465 *charbuf++ = c;
2466 char_offset++;
2468 else if (cmp_status->state == COMPOSING_CHAR)
2470 if (cmp_status->old_form)
2472 if (c >= 0)
2474 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2475 *charbuf++ = c;
2476 char_offset++;
2478 else
2480 *charbuf++ = -c;
2481 cmp_status->nchars++;
2482 cmp_status->length++;
2483 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2484 EMACS_MULE_COMPOSITION_END ();
2485 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2486 cmp_status->state = COMPOSING_RULE;
2489 else
2491 *charbuf++ = c;
2492 cmp_status->length++;
2493 cmp_status->nchars--;
2494 if (cmp_status->nchars == 0)
2495 EMACS_MULE_COMPOSITION_END ();
2498 else if (cmp_status->state == COMPOSING_RULE)
2500 int rule;
2502 if (c >= 0)
2504 EMACS_MULE_COMPOSITION_END ();
2505 *charbuf++ = c;
2506 char_offset++;
2508 else
2510 c = -c;
2511 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2512 if (rule < 0)
2513 goto invalid_code;
2514 *charbuf++ = -2;
2515 *charbuf++ = rule;
2516 cmp_status->length += 2;
2517 cmp_status->state = COMPOSING_CHAR;
2520 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2522 *charbuf++ = c;
2523 cmp_status->length++;
2524 if (cmp_status->ncomps == 0)
2525 cmp_status->state = COMPOSING_CHAR;
2526 else if (cmp_status->ncomps > 0)
2528 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2529 cmp_status->state = COMPOSING_COMPONENT_RULE;
2531 else
2532 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2534 else /* COMPOSING_COMPONENT_RULE */
2536 int rule;
2538 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2539 if (rule < 0)
2540 goto invalid_code;
2541 *charbuf++ = -2;
2542 *charbuf++ = rule;
2543 cmp_status->length += 2;
2544 cmp_status->ncomps--;
2545 if (cmp_status->ncomps > 0)
2546 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2547 else
2548 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2550 continue;
2552 invalid_code:
2553 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2554 src = src_base;
2555 consumed_chars = consumed_chars_base;
2556 ONE_MORE_BYTE (c);
2557 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2558 char_offset++;
2559 coding->errors++;
2562 no_more_source:
2563 if (cmp_status->state != COMPOSING_NO)
2565 if (coding->mode & CODING_MODE_LAST_BLOCK)
2566 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2567 else
2569 int i;
2571 charbuf -= cmp_status->length;
2572 for (i = 0; i < cmp_status->length; i++)
2573 cmp_status->carryover[i] = charbuf[i];
2576 if (last_id != charset_ascii)
2577 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2578 coding->consumed_char += consumed_chars_base;
2579 coding->consumed = src_base - coding->source;
2580 coding->charbuf_used = charbuf - coding->charbuf;
2584 #define EMACS_MULE_LEADING_CODES(id, codes) \
2585 do { \
2586 if (id < 0xA0) \
2587 codes[0] = id, codes[1] = 0; \
2588 else if (id < 0xE0) \
2589 codes[0] = 0x9A, codes[1] = id; \
2590 else if (id < 0xF0) \
2591 codes[0] = 0x9B, codes[1] = id; \
2592 else if (id < 0xF5) \
2593 codes[0] = 0x9C, codes[1] = id; \
2594 else \
2595 codes[0] = 0x9D, codes[1] = id; \
2596 } while (0);
2599 static bool
2600 encode_coding_emacs_mule (struct coding_system *coding)
2602 bool multibytep = coding->dst_multibyte;
2603 int *charbuf = coding->charbuf;
2604 int *charbuf_end = charbuf + coding->charbuf_used;
2605 unsigned char *dst = coding->destination + coding->produced;
2606 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2607 int safe_room = 8;
2608 ptrdiff_t produced_chars = 0;
2609 Lisp_Object attrs, charset_list;
2610 int c;
2611 int preferred_charset_id = -1;
2613 CODING_GET_INFO (coding, attrs, charset_list);
2614 if (! EQ (charset_list, Vemacs_mule_charset_list))
2616 charset_list = Vemacs_mule_charset_list;
2617 ASET (attrs, coding_attr_charset_list, charset_list);
2620 while (charbuf < charbuf_end)
2622 ASSURE_DESTINATION (safe_room);
2623 c = *charbuf++;
2625 if (c < 0)
2627 /* Handle an annotation. */
2628 switch (*charbuf)
2630 case CODING_ANNOTATE_COMPOSITION_MASK:
2631 /* Not yet implemented. */
2632 break;
2633 case CODING_ANNOTATE_CHARSET_MASK:
2634 preferred_charset_id = charbuf[3];
2635 if (preferred_charset_id >= 0
2636 && NILP (Fmemq (make_number (preferred_charset_id),
2637 charset_list)))
2638 preferred_charset_id = -1;
2639 break;
2640 default:
2641 emacs_abort ();
2643 charbuf += -c - 1;
2644 continue;
2647 if (ASCII_CHAR_P (c))
2648 EMIT_ONE_ASCII_BYTE (c);
2649 else if (CHAR_BYTE8_P (c))
2651 c = CHAR_TO_BYTE8 (c);
2652 EMIT_ONE_BYTE (c);
2654 else
2656 struct charset *charset;
2657 unsigned code;
2658 int dimension;
2659 int emacs_mule_id;
2660 unsigned char leading_codes[2];
2662 if (preferred_charset_id >= 0)
2664 bool result;
2666 charset = CHARSET_FROM_ID (preferred_charset_id);
2667 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2668 if (result)
2669 code = ENCODE_CHAR (charset, c);
2670 else
2671 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2672 &code, charset);
2674 else
2675 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2676 &code, charset);
2677 if (! charset)
2679 c = coding->default_char;
2680 if (ASCII_CHAR_P (c))
2682 EMIT_ONE_ASCII_BYTE (c);
2683 continue;
2685 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2686 &code, charset);
2688 dimension = CHARSET_DIMENSION (charset);
2689 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2690 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2691 EMIT_ONE_BYTE (leading_codes[0]);
2692 if (leading_codes[1])
2693 EMIT_ONE_BYTE (leading_codes[1]);
2694 if (dimension == 1)
2695 EMIT_ONE_BYTE (code | 0x80);
2696 else
2698 code |= 0x8080;
2699 EMIT_ONE_BYTE (code >> 8);
2700 EMIT_ONE_BYTE (code & 0xFF);
2704 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2705 coding->produced_char += produced_chars;
2706 coding->produced = dst - coding->destination;
2707 return 0;
2711 /*** 7. ISO2022 handlers ***/
2713 /* The following note describes the coding system ISO2022 briefly.
2714 Since the intention of this note is to help understand the
2715 functions in this file, some parts are NOT ACCURATE or are OVERLY
2716 SIMPLIFIED. For thorough understanding, please refer to the
2717 original document of ISO2022. This is equivalent to the standard
2718 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2720 ISO2022 provides many mechanisms to encode several character sets
2721 in 7-bit and 8-bit environments. For 7-bit environments, all text
2722 is encoded using bytes less than 128. This may make the encoded
2723 text a little bit longer, but the text passes more easily through
2724 several types of gateway, some of which strip off the MSB (Most
2725 Significant Bit).
2727 There are two kinds of character sets: control character sets and
2728 graphic character sets. The former contain control characters such
2729 as `newline' and `escape' to provide control functions (control
2730 functions are also provided by escape sequences). The latter
2731 contain graphic characters such as 'A' and '-'. Emacs recognizes
2732 two control character sets and many graphic character sets.
2734 Graphic character sets are classified into one of the following
2735 four classes, according to the number of bytes (DIMENSION) and
2736 number of characters in one dimension (CHARS) of the set:
2737 - DIMENSION1_CHARS94
2738 - DIMENSION1_CHARS96
2739 - DIMENSION2_CHARS94
2740 - DIMENSION2_CHARS96
2742 In addition, each character set is assigned an identification tag,
2743 unique for each set, called the "final character" (denoted as <F>
2744 hereafter). The <F> of each character set is decided by ECMA(*)
2745 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2746 (0x30..0x3F are for private use only).
2748 Note (*): ECMA = European Computer Manufacturers Association
2750 Here are examples of graphic character sets [NAME(<F>)]:
2751 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2752 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2753 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2754 o DIMENSION2_CHARS96 -- none for the moment
2756 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2757 C0 [0x00..0x1F] -- control character plane 0
2758 GL [0x20..0x7F] -- graphic character plane 0
2759 C1 [0x80..0x9F] -- control character plane 1
2760 GR [0xA0..0xFF] -- graphic character plane 1
2762 A control character set is directly designated and invoked to C0 or
2763 C1 by an escape sequence. The most common case is that:
2764 - ISO646's control character set is designated/invoked to C0, and
2765 - ISO6429's control character set is designated/invoked to C1,
2766 and usually these designations/invocations are omitted in encoded
2767 text. In a 7-bit environment, only C0 can be used, and a control
2768 character for C1 is encoded by an appropriate escape sequence to
2769 fit into the environment. All control characters for C1 are
2770 defined to have corresponding escape sequences.
2772 A graphic character set is at first designated to one of four
2773 graphic registers (G0 through G3), then these graphic registers are
2774 invoked to GL or GR. These designations and invocations can be
2775 done independently. The most common case is that G0 is invoked to
2776 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2777 these invocations and designations are omitted in encoded text.
2778 In a 7-bit environment, only GL can be used.
2780 When a graphic character set of CHARS94 is invoked to GL, codes
2781 0x20 and 0x7F of the GL area work as control characters SPACE and
2782 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2783 be used.
2785 There are two ways of invocation: locking-shift and single-shift.
2786 With locking-shift, the invocation lasts until the next different
2787 invocation, whereas with single-shift, the invocation affects the
2788 following character only and doesn't affect the locking-shift
2789 state. Invocations are done by the following control characters or
2790 escape sequences:
2792 ----------------------------------------------------------------------
2793 abbrev function cntrl escape seq description
2794 ----------------------------------------------------------------------
2795 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2796 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2797 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2798 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2799 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2800 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2801 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2802 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2803 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2804 ----------------------------------------------------------------------
2805 (*) These are not used by any known coding system.
2807 Control characters for these functions are defined by macros
2808 ISO_CODE_XXX in `coding.h'.
2810 Designations are done by the following escape sequences:
2811 ----------------------------------------------------------------------
2812 escape sequence description
2813 ----------------------------------------------------------------------
2814 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2815 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2816 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2817 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2818 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2819 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2820 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2821 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2822 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2823 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2824 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2825 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2826 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2827 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2828 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2829 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2830 ----------------------------------------------------------------------
2832 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2833 of dimension 1, chars 94, and final character <F>, etc...
2835 Note (*): Although these designations are not allowed in ISO2022,
2836 Emacs accepts them on decoding, and produces them on encoding
2837 CHARS96 character sets in a coding system which is characterized as
2838 7-bit environment, non-locking-shift, and non-single-shift.
2840 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2841 '(' must be omitted. We refer to this as "short-form" hereafter.
2843 Now you may notice that there are a lot of ways of encoding the
2844 same multilingual text in ISO2022. Actually, there exist many
2845 coding systems such as Compound Text (used in X11's inter client
2846 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2847 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2848 localized platforms), and all of these are variants of ISO2022.
2850 In addition to the above, Emacs handles two more kinds of escape
2851 sequences: ISO6429's direction specification and Emacs' private
2852 sequence for specifying character composition.
2854 ISO6429's direction specification takes the following form:
2855 o CSI ']' -- end of the current direction
2856 o CSI '0' ']' -- end of the current direction
2857 o CSI '1' ']' -- start of left-to-right text
2858 o CSI '2' ']' -- start of right-to-left text
2859 The control character CSI (0x9B: control sequence introducer) is
2860 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2862 Character composition specification takes the following form:
2863 o ESC '0' -- start relative composition
2864 o ESC '1' -- end composition
2865 o ESC '2' -- start rule-base composition (*)
2866 o ESC '3' -- start relative composition with alternate chars (**)
2867 o ESC '4' -- start rule-base composition with alternate chars (**)
2868 Since these are not standard escape sequences of any ISO standard,
2869 the use of them with these meanings is restricted to Emacs only.
2871 (*) This form is used only in Emacs 20.7 and older versions,
2872 but newer versions can safely decode it.
2873 (**) This form is used only in Emacs 21.1 and newer versions,
2874 and older versions can't decode it.
2876 Here's a list of example usages of these composition escape
2877 sequences (categorized by `enum composition_method').
2879 COMPOSITION_RELATIVE:
2880 ESC 0 CHAR [ CHAR ] ESC 1
2881 COMPOSITION_WITH_RULE:
2882 ESC 2 CHAR [ RULE CHAR ] ESC 1
2883 COMPOSITION_WITH_ALTCHARS:
2884 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2885 COMPOSITION_WITH_RULE_ALTCHARS:
2886 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2888 static enum iso_code_class_type iso_code_class[256];
2890 #define SAFE_CHARSET_P(coding, id) \
2891 ((id) <= (coding)->max_charset_id \
2892 && (coding)->safe_charsets[id] != 255)
2894 static void
2895 setup_iso_safe_charsets (Lisp_Object attrs)
2897 Lisp_Object charset_list, safe_charsets;
2898 Lisp_Object request;
2899 Lisp_Object reg_usage;
2900 Lisp_Object tail;
2901 EMACS_INT reg94, reg96;
2902 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2903 int max_charset_id;
2905 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2906 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2907 && ! EQ (charset_list, Viso_2022_charset_list))
2909 charset_list = Viso_2022_charset_list;
2910 ASET (attrs, coding_attr_charset_list, charset_list);
2911 ASET (attrs, coding_attr_safe_charsets, Qnil);
2914 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2915 return;
2917 max_charset_id = 0;
2918 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2920 int id = XINT (XCAR (tail));
2921 if (max_charset_id < id)
2922 max_charset_id = id;
2925 safe_charsets = make_uninit_string (max_charset_id + 1);
2926 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2927 request = AREF (attrs, coding_attr_iso_request);
2928 reg_usage = AREF (attrs, coding_attr_iso_usage);
2929 reg94 = XINT (XCAR (reg_usage));
2930 reg96 = XINT (XCDR (reg_usage));
2932 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2934 Lisp_Object id;
2935 Lisp_Object reg;
2936 struct charset *charset;
2938 id = XCAR (tail);
2939 charset = CHARSET_FROM_ID (XINT (id));
2940 reg = Fcdr (Fassq (id, request));
2941 if (! NILP (reg))
2942 SSET (safe_charsets, XINT (id), XINT (reg));
2943 else if (charset->iso_chars_96)
2945 if (reg96 < 4)
2946 SSET (safe_charsets, XINT (id), reg96);
2948 else
2950 if (reg94 < 4)
2951 SSET (safe_charsets, XINT (id), reg94);
2954 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2958 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2959 Return true if a text is encoded in one of ISO-2022 based coding
2960 systems. */
2962 static bool
2963 detect_coding_iso_2022 (struct coding_system *coding,
2964 struct coding_detection_info *detect_info)
2966 const unsigned char *src = coding->source, *src_base = src;
2967 const unsigned char *src_end = coding->source + coding->src_bytes;
2968 bool multibytep = coding->src_multibyte;
2969 bool single_shifting = 0;
2970 int id;
2971 int c, c1;
2972 ptrdiff_t consumed_chars = 0;
2973 int i;
2974 int rejected = 0;
2975 int found = 0;
2976 int composition_count = -1;
2978 detect_info->checked |= CATEGORY_MASK_ISO;
2980 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2982 struct coding_system *this = &(coding_categories[i]);
2983 Lisp_Object attrs, val;
2985 if (this->id < 0)
2986 continue;
2987 attrs = CODING_ID_ATTRS (this->id);
2988 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2989 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2990 setup_iso_safe_charsets (attrs);
2991 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2992 this->max_charset_id = SCHARS (val) - 1;
2993 this->safe_charsets = SDATA (val);
2996 /* A coding system of this category is always ASCII compatible. */
2997 src += coding->head_ascii;
2999 while (rejected != CATEGORY_MASK_ISO)
3001 src_base = src;
3002 ONE_MORE_BYTE (c);
3003 switch (c)
3005 case ISO_CODE_ESC:
3006 if (inhibit_iso_escape_detection)
3007 break;
3008 single_shifting = 0;
3009 ONE_MORE_BYTE (c);
3010 if (c == 'N' || c == 'O')
3012 /* ESC <Fe> for SS2 or SS3. */
3013 single_shifting = 1;
3014 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3016 else if (c == '1')
3018 /* End of composition. */
3019 if (composition_count < 0
3020 || composition_count > MAX_COMPOSITION_COMPONENTS)
3021 /* Invalid */
3022 break;
3023 composition_count = -1;
3024 found |= CATEGORY_MASK_ISO;
3026 else if (c >= '0' && c <= '4')
3028 /* ESC <Fp> for start/end composition. */
3029 composition_count = 0;
3031 else
3033 if (c >= '(' && c <= '/')
3035 /* Designation sequence for a charset of dimension 1. */
3036 ONE_MORE_BYTE (c1);
3037 if (c1 < ' ' || c1 >= 0x80
3038 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3039 /* Invalid designation sequence. Just ignore. */
3040 break;
3042 else if (c == '$')
3044 /* Designation sequence for a charset of dimension 2. */
3045 ONE_MORE_BYTE (c);
3046 if (c >= '@' && c <= 'B')
3047 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3048 id = iso_charset_table[1][0][c];
3049 else if (c >= '(' && c <= '/')
3051 ONE_MORE_BYTE (c1);
3052 if (c1 < ' ' || c1 >= 0x80
3053 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3054 /* Invalid designation sequence. Just ignore. */
3055 break;
3057 else
3058 /* Invalid designation sequence. Just ignore it. */
3059 break;
3061 else
3063 /* Invalid escape sequence. Just ignore it. */
3064 break;
3067 /* We found a valid designation sequence for CHARSET. */
3068 rejected |= CATEGORY_MASK_ISO_8BIT;
3069 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3070 id))
3071 found |= CATEGORY_MASK_ISO_7;
3072 else
3073 rejected |= CATEGORY_MASK_ISO_7;
3074 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3075 id))
3076 found |= CATEGORY_MASK_ISO_7_TIGHT;
3077 else
3078 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3079 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3080 id))
3081 found |= CATEGORY_MASK_ISO_7_ELSE;
3082 else
3083 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3084 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3085 id))
3086 found |= CATEGORY_MASK_ISO_8_ELSE;
3087 else
3088 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3090 break;
3092 case ISO_CODE_SO:
3093 case ISO_CODE_SI:
3094 /* Locking shift out/in. */
3095 if (inhibit_iso_escape_detection)
3096 break;
3097 single_shifting = 0;
3098 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3099 break;
3101 case ISO_CODE_CSI:
3102 /* Control sequence introducer. */
3103 single_shifting = 0;
3104 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3105 found |= CATEGORY_MASK_ISO_8_ELSE;
3106 goto check_extra_latin;
3108 case ISO_CODE_SS2:
3109 case ISO_CODE_SS3:
3110 /* Single shift. */
3111 if (inhibit_iso_escape_detection)
3112 break;
3113 single_shifting = 0;
3114 rejected |= CATEGORY_MASK_ISO_7BIT;
3115 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3116 & CODING_ISO_FLAG_SINGLE_SHIFT)
3118 found |= CATEGORY_MASK_ISO_8_1;
3119 single_shifting = 1;
3121 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3122 & CODING_ISO_FLAG_SINGLE_SHIFT)
3124 found |= CATEGORY_MASK_ISO_8_2;
3125 single_shifting = 1;
3127 if (single_shifting)
3128 break;
3129 goto check_extra_latin;
3131 default:
3132 if (c < 0)
3133 continue;
3134 if (c < 0x80)
3136 if (composition_count >= 0)
3137 composition_count++;
3138 single_shifting = 0;
3139 break;
3141 if (c >= 0xA0)
3143 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3144 found |= CATEGORY_MASK_ISO_8_1;
3145 /* Check the length of succeeding codes of the range
3146 0xA0..0FF. If the byte length is even, we include
3147 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3148 only when we are not single shifting. */
3149 if (! single_shifting
3150 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3152 int len = 1;
3153 while (src < src_end)
3155 src_base = src;
3156 ONE_MORE_BYTE (c);
3157 if (c < 0xA0)
3159 src = src_base;
3160 break;
3162 len++;
3165 if (len & 1 && src < src_end)
3167 rejected |= CATEGORY_MASK_ISO_8_2;
3168 if (composition_count >= 0)
3169 composition_count += len;
3171 else
3173 found |= CATEGORY_MASK_ISO_8_2;
3174 if (composition_count >= 0)
3175 composition_count += len / 2;
3178 break;
3180 check_extra_latin:
3181 if (! VECTORP (Vlatin_extra_code_table)
3182 || NILP (AREF (Vlatin_extra_code_table, c)))
3184 rejected = CATEGORY_MASK_ISO;
3185 break;
3187 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3188 & CODING_ISO_FLAG_LATIN_EXTRA)
3189 found |= CATEGORY_MASK_ISO_8_1;
3190 else
3191 rejected |= CATEGORY_MASK_ISO_8_1;
3192 rejected |= CATEGORY_MASK_ISO_8_2;
3193 break;
3196 detect_info->rejected |= CATEGORY_MASK_ISO;
3197 return 0;
3199 no_more_source:
3200 detect_info->rejected |= rejected;
3201 detect_info->found |= (found & ~rejected);
3202 return 1;
3206 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3207 escape sequence should be kept. */
3208 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3209 do { \
3210 int id, prev; \
3212 if (final < '0' || final >= 128 \
3213 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3214 || !SAFE_CHARSET_P (coding, id)) \
3216 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3217 chars_96 = -1; \
3218 break; \
3220 prev = CODING_ISO_DESIGNATION (coding, reg); \
3221 if (id == charset_jisx0201_roman) \
3223 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3224 id = charset_ascii; \
3226 else if (id == charset_jisx0208_1978) \
3228 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3229 id = charset_jisx0208; \
3231 CODING_ISO_DESIGNATION (coding, reg) = id; \
3232 /* If there was an invalid designation to REG previously, and this \
3233 designation is ASCII to REG, we should keep this designation \
3234 sequence. */ \
3235 if (prev == -2 && id == charset_ascii) \
3236 chars_96 = -1; \
3237 } while (0)
3240 /* Handle these composition sequence (ALT: alternate char):
3242 (1) relative composition: ESC 0 CHAR ... ESC 1
3243 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3244 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3245 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3247 When the start sequence (ESC 0/2/3/4) is found, this annotation
3248 header is produced.
3250 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3252 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3253 produced until the end sequence (ESC 1) is found:
3255 (1) CHAR ... CHAR
3256 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3257 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3258 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3260 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3261 annotation header is updated as below:
3263 (1) LENGTH: unchanged, NCHARS: number of CHARs
3264 (2) LENGTH: unchanged, NCHARS: number of CHARs
3265 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3266 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3268 If an error is found while composing, the annotation header is
3269 changed to:
3271 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3273 and the sequence [ -2 DECODED-RULE ] is changed to the original
3274 byte sequence as below:
3275 o the original byte sequence is B: [ B -1 ]
3276 o the original byte sequence is B1 B2: [ B1 B2 ]
3277 and the sequence [ -1 -1 ] is changed to the original byte
3278 sequence:
3279 [ ESC '0' ]
3282 /* Decode a composition rule C1 and maybe one more byte from the
3283 source, and set RULE to the encoded composition rule. If the rule
3284 is invalid, goto invalid_code. */
3286 #define DECODE_COMPOSITION_RULE(rule) \
3287 do { \
3288 rule = c1 - 32; \
3289 if (rule < 0) \
3290 goto invalid_code; \
3291 if (rule < 81) /* old format (before ver.21) */ \
3293 int gref = (rule) / 9; \
3294 int nref = (rule) % 9; \
3295 if (gref == 4) gref = 10; \
3296 if (nref == 4) nref = 10; \
3297 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3299 else /* new format (after ver.21) */ \
3301 int b; \
3303 ONE_MORE_BYTE (b); \
3304 if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32)) \
3305 goto invalid_code; \
3306 rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32); \
3307 rule += 0x100; /* Distinguish it from the old format. */ \
3309 } while (0)
3311 #define ENCODE_COMPOSITION_RULE(rule) \
3312 do { \
3313 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3315 if (rule < 0x100) /* old format */ \
3317 if (gref == 10) gref = 4; \
3318 if (nref == 10) nref = 4; \
3319 charbuf[idx] = 32 + gref * 9 + nref; \
3320 charbuf[idx + 1] = -1; \
3321 new_chars++; \
3323 else /* new format */ \
3325 charbuf[idx] = 32 + 81 + gref; \
3326 charbuf[idx + 1] = 32 + nref; \
3327 new_chars += 2; \
3329 } while (0)
3331 /* Finish the current composition as invalid. */
3333 static int
3334 finish_composition (int *charbuf, struct composition_status *cmp_status)
3336 int idx = - cmp_status->length;
3337 int new_chars;
3339 /* Recover the original ESC sequence */
3340 charbuf[idx++] = ISO_CODE_ESC;
3341 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3342 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3343 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3344 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3345 : '4');
3346 charbuf[idx++] = -2;
3347 charbuf[idx++] = 0;
3348 charbuf[idx++] = -1;
3349 new_chars = cmp_status->nchars;
3350 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3351 for (; idx < 0; idx++)
3353 int elt = charbuf[idx];
3355 if (elt == -2)
3357 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3358 idx++;
3360 else if (elt == -1)
3362 charbuf[idx++] = ISO_CODE_ESC;
3363 charbuf[idx] = '0';
3364 new_chars += 2;
3367 cmp_status->state = COMPOSING_NO;
3368 return new_chars;
3371 /* If characters are under composition, finish the composition. */
3372 #define MAYBE_FINISH_COMPOSITION() \
3373 do { \
3374 if (cmp_status->state != COMPOSING_NO) \
3375 char_offset += finish_composition (charbuf, cmp_status); \
3376 } while (0)
3378 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3380 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3381 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3382 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3383 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3385 Produce this annotation sequence now:
3387 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3390 #define DECODE_COMPOSITION_START(c1) \
3391 do { \
3392 if (c1 == '0' \
3393 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3394 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3395 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3396 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3398 *charbuf++ = -1; \
3399 *charbuf++= -1; \
3400 cmp_status->state = COMPOSING_CHAR; \
3401 cmp_status->length += 2; \
3403 else \
3405 MAYBE_FINISH_COMPOSITION (); \
3406 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3407 : c1 == '2' ? COMPOSITION_WITH_RULE \
3408 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3409 : COMPOSITION_WITH_RULE_ALTCHARS); \
3410 cmp_status->state \
3411 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3412 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3413 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3414 cmp_status->nchars = cmp_status->ncomps = 0; \
3415 coding->annotated = 1; \
3417 } while (0)
3420 /* Handle composition end sequence ESC 1. */
3422 #define DECODE_COMPOSITION_END() \
3423 do { \
3424 if (cmp_status->nchars == 0 \
3425 || ((cmp_status->state == COMPOSING_CHAR) \
3426 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3428 MAYBE_FINISH_COMPOSITION (); \
3429 goto invalid_code; \
3431 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3432 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3433 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3434 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3435 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3436 char_offset += cmp_status->nchars; \
3437 cmp_status->state = COMPOSING_NO; \
3438 } while (0)
3440 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3442 #define STORE_COMPOSITION_RULE(rule) \
3443 do { \
3444 *charbuf++ = -2; \
3445 *charbuf++ = rule; \
3446 cmp_status->length += 2; \
3447 cmp_status->state--; \
3448 } while (0)
3450 /* Store a composed char or a component char C in charbuf, and update
3451 cmp_status. */
3453 #define STORE_COMPOSITION_CHAR(c) \
3454 do { \
3455 *charbuf++ = (c); \
3456 cmp_status->length++; \
3457 if (cmp_status->state == COMPOSING_CHAR) \
3458 cmp_status->nchars++; \
3459 else \
3460 cmp_status->ncomps++; \
3461 if (cmp_status->method == COMPOSITION_WITH_RULE \
3462 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3463 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3464 cmp_status->state++; \
3465 } while (0)
3468 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3470 static void
3471 decode_coding_iso_2022 (struct coding_system *coding)
3473 const unsigned char *src = coding->source + coding->consumed;
3474 const unsigned char *src_end = coding->source + coding->src_bytes;
3475 const unsigned char *src_base;
3476 int *charbuf = coding->charbuf + coding->charbuf_used;
3477 /* We may produce two annotations (charset and composition) in one
3478 loop and one more charset annotation at the end. */
3479 int *charbuf_end
3480 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3481 ptrdiff_t consumed_chars = 0, consumed_chars_base;
3482 bool multibytep = coding->src_multibyte;
3483 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3484 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3485 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3486 int charset_id_2, charset_id_3;
3487 struct charset *charset;
3488 int c;
3489 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3490 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3491 ptrdiff_t char_offset = coding->produced_char;
3492 ptrdiff_t last_offset = char_offset;
3493 int last_id = charset_ascii;
3494 bool eol_dos
3495 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3496 int byte_after_cr = -1;
3497 int i;
3499 setup_iso_safe_charsets (attrs);
3500 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3502 if (cmp_status->state != COMPOSING_NO)
3504 if (charbuf_end - charbuf < cmp_status->length)
3505 emacs_abort ();
3506 for (i = 0; i < cmp_status->length; i++)
3507 *charbuf++ = cmp_status->carryover[i];
3508 coding->annotated = 1;
3511 while (1)
3513 int c1, c2, c3;
3515 src_base = src;
3516 consumed_chars_base = consumed_chars;
3518 if (charbuf >= charbuf_end)
3520 if (byte_after_cr >= 0)
3521 src_base--;
3522 break;
3525 if (byte_after_cr >= 0)
3526 c1 = byte_after_cr, byte_after_cr = -1;
3527 else
3528 ONE_MORE_BYTE (c1);
3529 if (c1 < 0)
3530 goto invalid_code;
3532 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3534 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3535 char_offset++;
3536 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3537 continue;
3540 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3542 if (c1 == ISO_CODE_ESC)
3544 if (src + 1 >= src_end)
3545 goto no_more_source;
3546 *charbuf++ = ISO_CODE_ESC;
3547 char_offset++;
3548 if (src[0] == '%' && src[1] == '@')
3550 src += 2;
3551 consumed_chars += 2;
3552 char_offset += 2;
3553 /* We are sure charbuf can contain two more chars. */
3554 *charbuf++ = '%';
3555 *charbuf++ = '@';
3556 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3559 else
3561 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3562 char_offset++;
3564 continue;
3567 if ((cmp_status->state == COMPOSING_RULE
3568 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3569 && c1 != ISO_CODE_ESC)
3571 int rule;
3573 DECODE_COMPOSITION_RULE (rule);
3574 STORE_COMPOSITION_RULE (rule);
3575 continue;
3578 /* We produce at most one character. */
3579 switch (iso_code_class [c1])
3581 case ISO_0x20_or_0x7F:
3582 if (charset_id_0 < 0
3583 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3584 /* This is SPACE or DEL. */
3585 charset = CHARSET_FROM_ID (charset_ascii);
3586 else
3587 charset = CHARSET_FROM_ID (charset_id_0);
3588 break;
3590 case ISO_graphic_plane_0:
3591 if (charset_id_0 < 0)
3592 charset = CHARSET_FROM_ID (charset_ascii);
3593 else
3594 charset = CHARSET_FROM_ID (charset_id_0);
3595 break;
3597 case ISO_0xA0_or_0xFF:
3598 if (charset_id_1 < 0
3599 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3600 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3601 goto invalid_code;
3602 /* This is a graphic character, we fall down ... */
3604 case ISO_graphic_plane_1:
3605 if (charset_id_1 < 0)
3606 goto invalid_code;
3607 charset = CHARSET_FROM_ID (charset_id_1);
3608 break;
3610 case ISO_control_0:
3611 if (eol_dos && c1 == '\r')
3612 ONE_MORE_BYTE (byte_after_cr);
3613 MAYBE_FINISH_COMPOSITION ();
3614 charset = CHARSET_FROM_ID (charset_ascii);
3615 break;
3617 case ISO_control_1:
3618 goto invalid_code;
3620 case ISO_shift_out:
3621 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3622 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3623 goto invalid_code;
3624 CODING_ISO_INVOCATION (coding, 0) = 1;
3625 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3626 continue;
3628 case ISO_shift_in:
3629 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3630 goto invalid_code;
3631 CODING_ISO_INVOCATION (coding, 0) = 0;
3632 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3633 continue;
3635 case ISO_single_shift_2_7:
3636 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3637 goto invalid_code;
3638 case ISO_single_shift_2:
3639 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3640 goto invalid_code;
3641 /* SS2 is handled as an escape sequence of ESC 'N' */
3642 c1 = 'N';
3643 goto label_escape_sequence;
3645 case ISO_single_shift_3:
3646 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3647 goto invalid_code;
3648 /* SS2 is handled as an escape sequence of ESC 'O' */
3649 c1 = 'O';
3650 goto label_escape_sequence;
3652 case ISO_control_sequence_introducer:
3653 /* CSI is handled as an escape sequence of ESC '[' ... */
3654 c1 = '[';
3655 goto label_escape_sequence;
3657 case ISO_escape:
3658 ONE_MORE_BYTE (c1);
3659 label_escape_sequence:
3660 /* Escape sequences handled here are invocation,
3661 designation, direction specification, and character
3662 composition specification. */
3663 switch (c1)
3665 case '&': /* revision of following character set */
3666 ONE_MORE_BYTE (c1);
3667 if (!(c1 >= '@' && c1 <= '~'))
3668 goto invalid_code;
3669 ONE_MORE_BYTE (c1);
3670 if (c1 != ISO_CODE_ESC)
3671 goto invalid_code;
3672 ONE_MORE_BYTE (c1);
3673 goto label_escape_sequence;
3675 case '$': /* designation of 2-byte character set */
3676 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3677 goto invalid_code;
3679 int reg, chars96;
3681 ONE_MORE_BYTE (c1);
3682 if (c1 >= '@' && c1 <= 'B')
3683 { /* designation of JISX0208.1978, GB2312.1980,
3684 or JISX0208.1980 */
3685 reg = 0, chars96 = 0;
3687 else if (c1 >= 0x28 && c1 <= 0x2B)
3688 { /* designation of DIMENSION2_CHARS94 character set */
3689 reg = c1 - 0x28, chars96 = 0;
3690 ONE_MORE_BYTE (c1);
3692 else if (c1 >= 0x2C && c1 <= 0x2F)
3693 { /* designation of DIMENSION2_CHARS96 character set */
3694 reg = c1 - 0x2C, chars96 = 1;
3695 ONE_MORE_BYTE (c1);
3697 else
3698 goto invalid_code;
3699 DECODE_DESIGNATION (reg, 2, chars96, c1);
3700 /* We must update these variables now. */
3701 if (reg == 0)
3702 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3703 else if (reg == 1)
3704 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3705 if (chars96 < 0)
3706 goto invalid_code;
3708 continue;
3710 case 'n': /* invocation of locking-shift-2 */
3711 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3712 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3713 goto invalid_code;
3714 CODING_ISO_INVOCATION (coding, 0) = 2;
3715 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3716 continue;
3718 case 'o': /* invocation of locking-shift-3 */
3719 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3720 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3721 goto invalid_code;
3722 CODING_ISO_INVOCATION (coding, 0) = 3;
3723 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3724 continue;
3726 case 'N': /* invocation of single-shift-2 */
3727 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3728 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3729 goto invalid_code;
3730 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3731 if (charset_id_2 < 0)
3732 charset = CHARSET_FROM_ID (charset_ascii);
3733 else
3734 charset = CHARSET_FROM_ID (charset_id_2);
3735 ONE_MORE_BYTE (c1);
3736 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3737 goto invalid_code;
3738 break;
3740 case 'O': /* invocation of single-shift-3 */
3741 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3742 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3743 goto invalid_code;
3744 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3745 if (charset_id_3 < 0)
3746 charset = CHARSET_FROM_ID (charset_ascii);
3747 else
3748 charset = CHARSET_FROM_ID (charset_id_3);
3749 ONE_MORE_BYTE (c1);
3750 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3751 goto invalid_code;
3752 break;
3754 case '0': case '2': case '3': case '4': /* start composition */
3755 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3756 goto invalid_code;
3757 if (last_id != charset_ascii)
3759 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3760 last_id = charset_ascii;
3761 last_offset = char_offset;
3763 DECODE_COMPOSITION_START (c1);
3764 continue;
3766 case '1': /* end composition */
3767 if (cmp_status->state == COMPOSING_NO)
3768 goto invalid_code;
3769 DECODE_COMPOSITION_END ();
3770 continue;
3772 case '[': /* specification of direction */
3773 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3774 goto invalid_code;
3775 /* For the moment, nested direction is not supported.
3776 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3777 left-to-right, and nonzero means right-to-left. */
3778 ONE_MORE_BYTE (c1);
3779 switch (c1)
3781 case ']': /* end of the current direction */
3782 coding->mode &= ~CODING_MODE_DIRECTION;
3784 case '0': /* end of the current direction */
3785 case '1': /* start of left-to-right direction */
3786 ONE_MORE_BYTE (c1);
3787 if (c1 == ']')
3788 coding->mode &= ~CODING_MODE_DIRECTION;
3789 else
3790 goto invalid_code;
3791 break;
3793 case '2': /* start of right-to-left direction */
3794 ONE_MORE_BYTE (c1);
3795 if (c1 == ']')
3796 coding->mode |= CODING_MODE_DIRECTION;
3797 else
3798 goto invalid_code;
3799 break;
3801 default:
3802 goto invalid_code;
3804 continue;
3806 case '%':
3807 ONE_MORE_BYTE (c1);
3808 if (c1 == '/')
3810 /* CTEXT extended segment:
3811 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3812 We keep these bytes as is for the moment.
3813 They may be decoded by post-read-conversion. */
3814 int dim, M, L;
3815 int size;
3817 ONE_MORE_BYTE (dim);
3818 if (dim < '0' || dim > '4')
3819 goto invalid_code;
3820 ONE_MORE_BYTE (M);
3821 if (M < 128)
3822 goto invalid_code;
3823 ONE_MORE_BYTE (L);
3824 if (L < 128)
3825 goto invalid_code;
3826 size = ((M - 128) * 128) + (L - 128);
3827 if (charbuf + 6 > charbuf_end)
3828 goto break_loop;
3829 *charbuf++ = ISO_CODE_ESC;
3830 *charbuf++ = '%';
3831 *charbuf++ = '/';
3832 *charbuf++ = dim;
3833 *charbuf++ = BYTE8_TO_CHAR (M);
3834 *charbuf++ = BYTE8_TO_CHAR (L);
3835 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3837 else if (c1 == 'G')
3839 /* XFree86 extension for embedding UTF-8 in CTEXT:
3840 ESC % G --UTF-8-BYTES-- ESC % @
3841 We keep these bytes as is for the moment.
3842 They may be decoded by post-read-conversion. */
3843 if (charbuf + 3 > charbuf_end)
3844 goto break_loop;
3845 *charbuf++ = ISO_CODE_ESC;
3846 *charbuf++ = '%';
3847 *charbuf++ = 'G';
3848 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3850 else
3851 goto invalid_code;
3852 continue;
3853 break;
3855 default:
3856 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3857 goto invalid_code;
3859 int reg, chars96;
3861 if (c1 >= 0x28 && c1 <= 0x2B)
3862 { /* designation of DIMENSION1_CHARS94 character set */
3863 reg = c1 - 0x28, chars96 = 0;
3864 ONE_MORE_BYTE (c1);
3866 else if (c1 >= 0x2C && c1 <= 0x2F)
3867 { /* designation of DIMENSION1_CHARS96 character set */
3868 reg = c1 - 0x2C, chars96 = 1;
3869 ONE_MORE_BYTE (c1);
3871 else
3872 goto invalid_code;
3873 DECODE_DESIGNATION (reg, 1, chars96, c1);
3874 /* We must update these variables now. */
3875 if (reg == 0)
3876 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3877 else if (reg == 1)
3878 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3879 if (chars96 < 0)
3880 goto invalid_code;
3882 continue;
3884 break;
3886 default:
3887 emacs_abort ();
3890 if (cmp_status->state == COMPOSING_NO
3891 && charset->id != charset_ascii
3892 && last_id != charset->id)
3894 if (last_id != charset_ascii)
3895 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3896 last_id = charset->id;
3897 last_offset = char_offset;
3900 /* Now we know CHARSET and 1st position code C1 of a character.
3901 Produce a decoded character while getting 2nd and 3rd
3902 position codes C2, C3 if necessary. */
3903 if (CHARSET_DIMENSION (charset) > 1)
3905 ONE_MORE_BYTE (c2);
3906 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3907 || ((c1 & 0x80) != (c2 & 0x80)))
3908 /* C2 is not in a valid range. */
3909 goto invalid_code;
3910 if (CHARSET_DIMENSION (charset) == 2)
3911 c1 = (c1 << 8) | c2;
3912 else
3914 ONE_MORE_BYTE (c3);
3915 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3916 || ((c1 & 0x80) != (c3 & 0x80)))
3917 /* C3 is not in a valid range. */
3918 goto invalid_code;
3919 c1 = (c1 << 16) | (c2 << 8) | c2;
3922 c1 &= 0x7F7F7F;
3923 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3924 if (c < 0)
3926 MAYBE_FINISH_COMPOSITION ();
3927 for (; src_base < src; src_base++, char_offset++)
3929 if (ASCII_BYTE_P (*src_base))
3930 *charbuf++ = *src_base;
3931 else
3932 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3935 else if (cmp_status->state == COMPOSING_NO)
3937 *charbuf++ = c;
3938 char_offset++;
3940 else if ((cmp_status->state == COMPOSING_CHAR
3941 ? cmp_status->nchars
3942 : cmp_status->ncomps)
3943 >= MAX_COMPOSITION_COMPONENTS)
3945 /* Too long composition. */
3946 MAYBE_FINISH_COMPOSITION ();
3947 *charbuf++ = c;
3948 char_offset++;
3950 else
3951 STORE_COMPOSITION_CHAR (c);
3952 continue;
3954 invalid_code:
3955 MAYBE_FINISH_COMPOSITION ();
3956 src = src_base;
3957 consumed_chars = consumed_chars_base;
3958 ONE_MORE_BYTE (c);
3959 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3960 char_offset++;
3961 coding->errors++;
3962 /* Reset the invocation and designation status to the safest
3963 one; i.e. designate ASCII to the graphic register 0, and
3964 invoke that register to the graphic plane 0. This typically
3965 helps the case that an designation sequence for ASCII "ESC (
3966 B" is somehow broken (e.g. broken by a newline). */
3967 CODING_ISO_INVOCATION (coding, 0) = 0;
3968 CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3969 charset_id_0 = charset_ascii;
3970 continue;
3972 break_loop:
3973 break;
3976 no_more_source:
3977 if (cmp_status->state != COMPOSING_NO)
3979 if (coding->mode & CODING_MODE_LAST_BLOCK)
3980 MAYBE_FINISH_COMPOSITION ();
3981 else
3983 charbuf -= cmp_status->length;
3984 for (i = 0; i < cmp_status->length; i++)
3985 cmp_status->carryover[i] = charbuf[i];
3988 else if (last_id != charset_ascii)
3989 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3990 coding->consumed_char += consumed_chars_base;
3991 coding->consumed = src_base - coding->source;
3992 coding->charbuf_used = charbuf - coding->charbuf;
3996 /* ISO2022 encoding stuff. */
3999 It is not enough to say just "ISO2022" on encoding, we have to
4000 specify more details. In Emacs, each coding system of ISO2022
4001 variant has the following specifications:
4002 1. Initial designation to G0 thru G3.
4003 2. Allows short-form designation?
4004 3. ASCII should be designated to G0 before control characters?
4005 4. ASCII should be designated to G0 at end of line?
4006 5. 7-bit environment or 8-bit environment?
4007 6. Use locking-shift?
4008 7. Use Single-shift?
4009 And the following two are only for Japanese:
4010 8. Use ASCII in place of JIS0201-1976-Roman?
4011 9. Use JISX0208-1983 in place of JISX0208-1978?
4012 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4013 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
4014 details.
4017 /* Produce codes (escape sequence) for designating CHARSET to graphic
4018 register REG at DST, and increment DST. If <final-char> of CHARSET is
4019 '@', 'A', or 'B' and the coding system CODING allows, produce
4020 designation sequence of short-form. */
4022 #define ENCODE_DESIGNATION(charset, reg, coding) \
4023 do { \
4024 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4025 const char *intermediate_char_94 = "()*+"; \
4026 const char *intermediate_char_96 = ",-./"; \
4027 int revision = -1; \
4029 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
4030 revision = CHARSET_ISO_REVISION (charset); \
4032 if (revision >= 0) \
4034 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
4035 EMIT_ONE_BYTE ('@' + revision); \
4037 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4038 if (CHARSET_DIMENSION (charset) == 1) \
4040 int b; \
4041 if (! CHARSET_ISO_CHARS_96 (charset)) \
4042 b = intermediate_char_94[reg]; \
4043 else \
4044 b = intermediate_char_96[reg]; \
4045 EMIT_ONE_ASCII_BYTE (b); \
4047 else \
4049 EMIT_ONE_ASCII_BYTE ('$'); \
4050 if (! CHARSET_ISO_CHARS_96 (charset)) \
4052 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
4053 || reg != 0 \
4054 || final_char < '@' || final_char > 'B') \
4055 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4057 else \
4058 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4060 EMIT_ONE_ASCII_BYTE (final_char); \
4062 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4063 } while (0)
4066 /* The following two macros produce codes (control character or escape
4067 sequence) for ISO2022 single-shift functions (single-shift-2 and
4068 single-shift-3). */
4070 #define ENCODE_SINGLE_SHIFT_2 \
4071 do { \
4072 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4073 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4074 else \
4075 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4076 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4077 } while (0)
4080 #define ENCODE_SINGLE_SHIFT_3 \
4081 do { \
4082 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4083 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4084 else \
4085 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4086 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4087 } while (0)
4090 /* The following four macros produce codes (control character or
4091 escape sequence) for ISO2022 locking-shift functions (shift-in,
4092 shift-out, locking-shift-2, and locking-shift-3). */
4094 #define ENCODE_SHIFT_IN \
4095 do { \
4096 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4097 CODING_ISO_INVOCATION (coding, 0) = 0; \
4098 } while (0)
4101 #define ENCODE_SHIFT_OUT \
4102 do { \
4103 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4104 CODING_ISO_INVOCATION (coding, 0) = 1; \
4105 } while (0)
4108 #define ENCODE_LOCKING_SHIFT_2 \
4109 do { \
4110 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4111 CODING_ISO_INVOCATION (coding, 0) = 2; \
4112 } while (0)
4115 #define ENCODE_LOCKING_SHIFT_3 \
4116 do { \
4117 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4118 CODING_ISO_INVOCATION (coding, 0) = 3; \
4119 } while (0)
4122 /* Produce codes for a DIMENSION1 character whose character set is
4123 CHARSET and whose position-code is C1. Designation and invocation
4124 sequences are also produced in advance if necessary. */
4126 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4127 do { \
4128 int id = CHARSET_ID (charset); \
4130 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4131 && id == charset_ascii) \
4133 id = charset_jisx0201_roman; \
4134 charset = CHARSET_FROM_ID (id); \
4137 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4139 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4140 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4141 else \
4142 EMIT_ONE_BYTE (c1 | 0x80); \
4143 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4144 break; \
4146 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4148 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4149 break; \
4151 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4153 EMIT_ONE_BYTE (c1 | 0x80); \
4154 break; \
4156 else \
4157 /* Since CHARSET is not yet invoked to any graphic planes, we \
4158 must invoke it, or, at first, designate it to some graphic \
4159 register. Then repeat the loop to actually produce the \
4160 character. */ \
4161 dst = encode_invocation_designation (charset, coding, dst, \
4162 &produced_chars); \
4163 } while (1)
4166 /* Produce codes for a DIMENSION2 character whose character set is
4167 CHARSET and whose position-codes are C1 and C2. Designation and
4168 invocation codes are also produced in advance if necessary. */
4170 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4171 do { \
4172 int id = CHARSET_ID (charset); \
4174 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4175 && id == charset_jisx0208) \
4177 id = charset_jisx0208_1978; \
4178 charset = CHARSET_FROM_ID (id); \
4181 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4183 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4184 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4185 else \
4186 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4187 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4188 break; \
4190 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4192 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4193 break; \
4195 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4197 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4198 break; \
4200 else \
4201 /* Since CHARSET is not yet invoked to any graphic planes, we \
4202 must invoke it, or, at first, designate it to some graphic \
4203 register. Then repeat the loop to actually produce the \
4204 character. */ \
4205 dst = encode_invocation_designation (charset, coding, dst, \
4206 &produced_chars); \
4207 } while (1)
4210 #define ENCODE_ISO_CHARACTER(charset, c) \
4211 do { \
4212 unsigned code; \
4213 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4215 if (CHARSET_DIMENSION (charset) == 1) \
4216 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4217 else \
4218 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4219 } while (0)
4222 /* Produce designation and invocation codes at a place pointed by DST
4223 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4224 Return new DST. */
4226 static unsigned char *
4227 encode_invocation_designation (struct charset *charset,
4228 struct coding_system *coding,
4229 unsigned char *dst, ptrdiff_t *p_nchars)
4231 bool multibytep = coding->dst_multibyte;
4232 ptrdiff_t produced_chars = *p_nchars;
4233 int reg; /* graphic register number */
4234 int id = CHARSET_ID (charset);
4236 /* At first, check designations. */
4237 for (reg = 0; reg < 4; reg++)
4238 if (id == CODING_ISO_DESIGNATION (coding, reg))
4239 break;
4241 if (reg >= 4)
4243 /* CHARSET is not yet designated to any graphic registers. */
4244 /* At first check the requested designation. */
4245 reg = CODING_ISO_REQUEST (coding, id);
4246 if (reg < 0)
4247 /* Since CHARSET requests no special designation, designate it
4248 to graphic register 0. */
4249 reg = 0;
4251 ENCODE_DESIGNATION (charset, reg, coding);
4254 if (CODING_ISO_INVOCATION (coding, 0) != reg
4255 && CODING_ISO_INVOCATION (coding, 1) != reg)
4257 /* Since the graphic register REG is not invoked to any graphic
4258 planes, invoke it to graphic plane 0. */
4259 switch (reg)
4261 case 0: /* graphic register 0 */
4262 ENCODE_SHIFT_IN;
4263 break;
4265 case 1: /* graphic register 1 */
4266 ENCODE_SHIFT_OUT;
4267 break;
4269 case 2: /* graphic register 2 */
4270 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4271 ENCODE_SINGLE_SHIFT_2;
4272 else
4273 ENCODE_LOCKING_SHIFT_2;
4274 break;
4276 case 3: /* graphic register 3 */
4277 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4278 ENCODE_SINGLE_SHIFT_3;
4279 else
4280 ENCODE_LOCKING_SHIFT_3;
4281 break;
4285 *p_nchars = produced_chars;
4286 return dst;
4290 /* Produce codes for designation and invocation to reset the graphic
4291 planes and registers to initial state. */
4292 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4293 do { \
4294 int reg; \
4295 struct charset *charset; \
4297 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4298 ENCODE_SHIFT_IN; \
4299 for (reg = 0; reg < 4; reg++) \
4300 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4301 && (CODING_ISO_DESIGNATION (coding, reg) \
4302 != CODING_ISO_INITIAL (coding, reg))) \
4304 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4305 ENCODE_DESIGNATION (charset, reg, coding); \
4307 } while (0)
4310 /* Produce designation sequences of charsets in the line started from
4311 CHARBUF to a place pointed by DST, and return the number of
4312 produced bytes. DST should not directly point a buffer text area
4313 which may be relocated by char_charset call.
4315 If the current block ends before any end-of-line, we may fail to
4316 find all the necessary designations. */
4318 static ptrdiff_t
4319 encode_designation_at_bol (struct coding_system *coding,
4320 int *charbuf, int *charbuf_end,
4321 unsigned char *dst)
4323 unsigned char *orig = dst;
4324 struct charset *charset;
4325 /* Table of charsets to be designated to each graphic register. */
4326 int r[4];
4327 int c, found = 0, reg;
4328 ptrdiff_t produced_chars = 0;
4329 bool multibytep = coding->dst_multibyte;
4330 Lisp_Object attrs;
4331 Lisp_Object charset_list;
4333 attrs = CODING_ID_ATTRS (coding->id);
4334 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4335 if (EQ (charset_list, Qiso_2022))
4336 charset_list = Viso_2022_charset_list;
4338 for (reg = 0; reg < 4; reg++)
4339 r[reg] = -1;
4341 while (charbuf < charbuf_end && found < 4)
4343 int id;
4345 c = *charbuf++;
4346 if (c == '\n')
4347 break;
4348 charset = char_charset (c, charset_list, NULL);
4349 id = CHARSET_ID (charset);
4350 reg = CODING_ISO_REQUEST (coding, id);
4351 if (reg >= 0 && r[reg] < 0)
4353 found++;
4354 r[reg] = id;
4358 if (found)
4360 for (reg = 0; reg < 4; reg++)
4361 if (r[reg] >= 0
4362 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4363 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4366 return dst - orig;
4369 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4371 static bool
4372 encode_coding_iso_2022 (struct coding_system *coding)
4374 bool multibytep = coding->dst_multibyte;
4375 int *charbuf = coding->charbuf;
4376 int *charbuf_end = charbuf + coding->charbuf_used;
4377 unsigned char *dst = coding->destination + coding->produced;
4378 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4379 int safe_room = 16;
4380 bool bol_designation
4381 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4382 && CODING_ISO_BOL (coding));
4383 ptrdiff_t produced_chars = 0;
4384 Lisp_Object attrs, eol_type, charset_list;
4385 bool ascii_compatible;
4386 int c;
4387 int preferred_charset_id = -1;
4389 CODING_GET_INFO (coding, attrs, charset_list);
4390 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4391 if (VECTORP (eol_type))
4392 eol_type = Qunix;
4394 setup_iso_safe_charsets (attrs);
4395 /* Charset list may have been changed. */
4396 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4397 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4399 ascii_compatible
4400 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4401 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4402 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4404 while (charbuf < charbuf_end)
4406 ASSURE_DESTINATION (safe_room);
4408 if (bol_designation)
4410 /* We have to produce designation sequences if any now. */
4411 unsigned char desig_buf[16];
4412 int nbytes;
4413 ptrdiff_t offset;
4415 charset_map_loaded = 0;
4416 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4417 desig_buf);
4418 if (charset_map_loaded
4419 && (offset = coding_change_destination (coding)))
4421 dst += offset;
4422 dst_end += offset;
4424 memcpy (dst, desig_buf, nbytes);
4425 dst += nbytes;
4426 /* We are sure that designation sequences are all ASCII bytes. */
4427 produced_chars += nbytes;
4428 bol_designation = 0;
4429 ASSURE_DESTINATION (safe_room);
4432 c = *charbuf++;
4434 if (c < 0)
4436 /* Handle an annotation. */
4437 switch (*charbuf)
4439 case CODING_ANNOTATE_COMPOSITION_MASK:
4440 /* Not yet implemented. */
4441 break;
4442 case CODING_ANNOTATE_CHARSET_MASK:
4443 preferred_charset_id = charbuf[2];
4444 if (preferred_charset_id >= 0
4445 && NILP (Fmemq (make_number (preferred_charset_id),
4446 charset_list)))
4447 preferred_charset_id = -1;
4448 break;
4449 default:
4450 emacs_abort ();
4452 charbuf += -c - 1;
4453 continue;
4456 /* Now encode the character C. */
4457 if (c < 0x20 || c == 0x7F)
4459 if (c == '\n'
4460 || (c == '\r' && EQ (eol_type, Qmac)))
4462 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4463 ENCODE_RESET_PLANE_AND_REGISTER ();
4464 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4466 int i;
4468 for (i = 0; i < 4; i++)
4469 CODING_ISO_DESIGNATION (coding, i)
4470 = CODING_ISO_INITIAL (coding, i);
4472 bol_designation = ((CODING_ISO_FLAGS (coding)
4473 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4474 != 0);
4476 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4477 ENCODE_RESET_PLANE_AND_REGISTER ();
4478 EMIT_ONE_ASCII_BYTE (c);
4480 else if (ASCII_CHAR_P (c))
4482 if (ascii_compatible)
4483 EMIT_ONE_ASCII_BYTE (c);
4484 else
4486 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4487 ENCODE_ISO_CHARACTER (charset, c);
4490 else if (CHAR_BYTE8_P (c))
4492 c = CHAR_TO_BYTE8 (c);
4493 EMIT_ONE_BYTE (c);
4495 else
4497 struct charset *charset;
4499 if (preferred_charset_id >= 0)
4501 bool result;
4503 charset = CHARSET_FROM_ID (preferred_charset_id);
4504 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4505 if (! result)
4506 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4507 NULL, charset);
4509 else
4510 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4511 NULL, charset);
4512 if (!charset)
4514 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4516 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4517 charset = CHARSET_FROM_ID (charset_ascii);
4519 else
4521 c = coding->default_char;
4522 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4523 charset_list, NULL, charset);
4526 ENCODE_ISO_CHARACTER (charset, c);
4530 if (coding->mode & CODING_MODE_LAST_BLOCK
4531 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4533 ASSURE_DESTINATION (safe_room);
4534 ENCODE_RESET_PLANE_AND_REGISTER ();
4536 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4537 CODING_ISO_BOL (coding) = bol_designation;
4538 coding->produced_char += produced_chars;
4539 coding->produced = dst - coding->destination;
4540 return 0;
4544 /*** 8,9. SJIS and BIG5 handlers ***/
4546 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4547 quite widely. So, for the moment, Emacs supports them in the bare
4548 C code. But, in the future, they may be supported only by CCL. */
4550 /* SJIS is a coding system encoding three character sets: ASCII, right
4551 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4552 as is. A character of charset katakana-jisx0201 is encoded by
4553 "position-code + 0x80". A character of charset japanese-jisx0208
4554 is encoded in 2-byte but two position-codes are divided and shifted
4555 so that it fit in the range below.
4557 --- CODE RANGE of SJIS ---
4558 (character set) (range)
4559 ASCII 0x00 .. 0x7F
4560 KATAKANA-JISX0201 0xA0 .. 0xDF
4561 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4562 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4563 -------------------------------
4567 /* BIG5 is a coding system encoding two character sets: ASCII and
4568 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4569 character set and is encoded in two-byte.
4571 --- CODE RANGE of BIG5 ---
4572 (character set) (range)
4573 ASCII 0x00 .. 0x7F
4574 Big5 (1st byte) 0xA1 .. 0xFE
4575 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4576 --------------------------
4580 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4581 Return true if a text is encoded in SJIS. */
4583 static bool
4584 detect_coding_sjis (struct coding_system *coding,
4585 struct coding_detection_info *detect_info)
4587 const unsigned char *src = coding->source, *src_base;
4588 const unsigned char *src_end = coding->source + coding->src_bytes;
4589 bool multibytep = coding->src_multibyte;
4590 ptrdiff_t consumed_chars = 0;
4591 int found = 0;
4592 int c;
4593 Lisp_Object attrs, charset_list;
4594 int max_first_byte_of_2_byte_code;
4596 CODING_GET_INFO (coding, attrs, charset_list);
4597 max_first_byte_of_2_byte_code
4598 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4600 detect_info->checked |= CATEGORY_MASK_SJIS;
4601 /* A coding system of this category is always ASCII compatible. */
4602 src += coding->head_ascii;
4604 while (1)
4606 src_base = src;
4607 ONE_MORE_BYTE (c);
4608 if (c < 0x80)
4609 continue;
4610 if ((c >= 0x81 && c <= 0x9F)
4611 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4613 ONE_MORE_BYTE (c);
4614 if (c < 0x40 || c == 0x7F || c > 0xFC)
4615 break;
4616 found = CATEGORY_MASK_SJIS;
4618 else if (c >= 0xA0 && c < 0xE0)
4619 found = CATEGORY_MASK_SJIS;
4620 else
4621 break;
4623 detect_info->rejected |= CATEGORY_MASK_SJIS;
4624 return 0;
4626 no_more_source:
4627 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4629 detect_info->rejected |= CATEGORY_MASK_SJIS;
4630 return 0;
4632 detect_info->found |= found;
4633 return 1;
4636 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4637 Return true if a text is encoded in BIG5. */
4639 static bool
4640 detect_coding_big5 (struct coding_system *coding,
4641 struct coding_detection_info *detect_info)
4643 const unsigned char *src = coding->source, *src_base;
4644 const unsigned char *src_end = coding->source + coding->src_bytes;
4645 bool multibytep = coding->src_multibyte;
4646 ptrdiff_t consumed_chars = 0;
4647 int found = 0;
4648 int c;
4650 detect_info->checked |= CATEGORY_MASK_BIG5;
4651 /* A coding system of this category is always ASCII compatible. */
4652 src += coding->head_ascii;
4654 while (1)
4656 src_base = src;
4657 ONE_MORE_BYTE (c);
4658 if (c < 0x80)
4659 continue;
4660 if (c >= 0xA1)
4662 ONE_MORE_BYTE (c);
4663 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4664 return 0;
4665 found = CATEGORY_MASK_BIG5;
4667 else
4668 break;
4670 detect_info->rejected |= CATEGORY_MASK_BIG5;
4671 return 0;
4673 no_more_source:
4674 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4676 detect_info->rejected |= CATEGORY_MASK_BIG5;
4677 return 0;
4679 detect_info->found |= found;
4680 return 1;
4683 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4685 static void
4686 decode_coding_sjis (struct coding_system *coding)
4688 const unsigned char *src = coding->source + coding->consumed;
4689 const unsigned char *src_end = coding->source + coding->src_bytes;
4690 const unsigned char *src_base;
4691 int *charbuf = coding->charbuf + coding->charbuf_used;
4692 /* We may produce one charset annotation in one loop and one more at
4693 the end. */
4694 int *charbuf_end
4695 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4696 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4697 bool multibytep = coding->src_multibyte;
4698 struct charset *charset_roman, *charset_kanji, *charset_kana;
4699 struct charset *charset_kanji2;
4700 Lisp_Object attrs, charset_list, val;
4701 ptrdiff_t char_offset = coding->produced_char;
4702 ptrdiff_t last_offset = char_offset;
4703 int last_id = charset_ascii;
4704 bool eol_dos
4705 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4706 int byte_after_cr = -1;
4708 CODING_GET_INFO (coding, attrs, charset_list);
4710 val = charset_list;
4711 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4712 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4713 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4714 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4716 while (1)
4718 int c, c1;
4719 struct charset *charset;
4721 src_base = src;
4722 consumed_chars_base = consumed_chars;
4724 if (charbuf >= charbuf_end)
4726 if (byte_after_cr >= 0)
4727 src_base--;
4728 break;
4731 if (byte_after_cr >= 0)
4732 c = byte_after_cr, byte_after_cr = -1;
4733 else
4734 ONE_MORE_BYTE (c);
4735 if (c < 0)
4736 goto invalid_code;
4737 if (c < 0x80)
4739 if (eol_dos && c == '\r')
4740 ONE_MORE_BYTE (byte_after_cr);
4741 charset = charset_roman;
4743 else if (c == 0x80 || c == 0xA0)
4744 goto invalid_code;
4745 else if (c >= 0xA1 && c <= 0xDF)
4747 /* SJIS -> JISX0201-Kana */
4748 c &= 0x7F;
4749 charset = charset_kana;
4751 else if (c <= 0xEF)
4753 /* SJIS -> JISX0208 */
4754 ONE_MORE_BYTE (c1);
4755 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4756 goto invalid_code;
4757 c = (c << 8) | c1;
4758 SJIS_TO_JIS (c);
4759 charset = charset_kanji;
4761 else if (c <= 0xFC && charset_kanji2)
4763 /* SJIS -> JISX0213-2 */
4764 ONE_MORE_BYTE (c1);
4765 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4766 goto invalid_code;
4767 c = (c << 8) | c1;
4768 SJIS_TO_JIS2 (c);
4769 charset = charset_kanji2;
4771 else
4772 goto invalid_code;
4773 if (charset->id != charset_ascii
4774 && last_id != charset->id)
4776 if (last_id != charset_ascii)
4777 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4778 last_id = charset->id;
4779 last_offset = char_offset;
4781 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4782 *charbuf++ = c;
4783 char_offset++;
4784 continue;
4786 invalid_code:
4787 src = src_base;
4788 consumed_chars = consumed_chars_base;
4789 ONE_MORE_BYTE (c);
4790 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4791 char_offset++;
4792 coding->errors++;
4795 no_more_source:
4796 if (last_id != charset_ascii)
4797 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4798 coding->consumed_char += consumed_chars_base;
4799 coding->consumed = src_base - coding->source;
4800 coding->charbuf_used = charbuf - coding->charbuf;
4803 static void
4804 decode_coding_big5 (struct coding_system *coding)
4806 const unsigned char *src = coding->source + coding->consumed;
4807 const unsigned char *src_end = coding->source + coding->src_bytes;
4808 const unsigned char *src_base;
4809 int *charbuf = coding->charbuf + coding->charbuf_used;
4810 /* We may produce one charset annotation in one loop and one more at
4811 the end. */
4812 int *charbuf_end
4813 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4814 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4815 bool multibytep = coding->src_multibyte;
4816 struct charset *charset_roman, *charset_big5;
4817 Lisp_Object attrs, charset_list, val;
4818 ptrdiff_t char_offset = coding->produced_char;
4819 ptrdiff_t last_offset = char_offset;
4820 int last_id = charset_ascii;
4821 bool eol_dos
4822 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4823 int byte_after_cr = -1;
4825 CODING_GET_INFO (coding, attrs, charset_list);
4826 val = charset_list;
4827 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4828 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4830 while (1)
4832 int c, c1;
4833 struct charset *charset;
4835 src_base = src;
4836 consumed_chars_base = consumed_chars;
4838 if (charbuf >= charbuf_end)
4840 if (byte_after_cr >= 0)
4841 src_base--;
4842 break;
4845 if (byte_after_cr >= 0)
4846 c = byte_after_cr, byte_after_cr = -1;
4847 else
4848 ONE_MORE_BYTE (c);
4850 if (c < 0)
4851 goto invalid_code;
4852 if (c < 0x80)
4854 if (eol_dos && c == '\r')
4855 ONE_MORE_BYTE (byte_after_cr);
4856 charset = charset_roman;
4858 else
4860 /* BIG5 -> Big5 */
4861 if (c < 0xA1 || c > 0xFE)
4862 goto invalid_code;
4863 ONE_MORE_BYTE (c1);
4864 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4865 goto invalid_code;
4866 c = c << 8 | c1;
4867 charset = charset_big5;
4869 if (charset->id != charset_ascii
4870 && last_id != charset->id)
4872 if (last_id != charset_ascii)
4873 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4874 last_id = charset->id;
4875 last_offset = char_offset;
4877 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4878 *charbuf++ = c;
4879 char_offset++;
4880 continue;
4882 invalid_code:
4883 src = src_base;
4884 consumed_chars = consumed_chars_base;
4885 ONE_MORE_BYTE (c);
4886 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4887 char_offset++;
4888 coding->errors++;
4891 no_more_source:
4892 if (last_id != charset_ascii)
4893 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4894 coding->consumed_char += consumed_chars_base;
4895 coding->consumed = src_base - coding->source;
4896 coding->charbuf_used = charbuf - coding->charbuf;
4899 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4900 This function can encode charsets `ascii', `katakana-jisx0201',
4901 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4902 are sure that all these charsets are registered as official charset
4903 (i.e. do not have extended leading-codes). Characters of other
4904 charsets are produced without any encoding. */
4906 static bool
4907 encode_coding_sjis (struct coding_system *coding)
4909 bool multibytep = coding->dst_multibyte;
4910 int *charbuf = coding->charbuf;
4911 int *charbuf_end = charbuf + coding->charbuf_used;
4912 unsigned char *dst = coding->destination + coding->produced;
4913 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4914 int safe_room = 4;
4915 ptrdiff_t produced_chars = 0;
4916 Lisp_Object attrs, charset_list, val;
4917 bool ascii_compatible;
4918 struct charset *charset_kanji, *charset_kana;
4919 struct charset *charset_kanji2;
4920 int c;
4922 CODING_GET_INFO (coding, attrs, charset_list);
4923 val = XCDR (charset_list);
4924 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4925 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4926 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4928 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4930 while (charbuf < charbuf_end)
4932 ASSURE_DESTINATION (safe_room);
4933 c = *charbuf++;
4934 /* Now encode the character C. */
4935 if (ASCII_CHAR_P (c) && ascii_compatible)
4936 EMIT_ONE_ASCII_BYTE (c);
4937 else if (CHAR_BYTE8_P (c))
4939 c = CHAR_TO_BYTE8 (c);
4940 EMIT_ONE_BYTE (c);
4942 else
4944 unsigned code;
4945 struct charset *charset;
4946 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4947 &code, charset);
4949 if (!charset)
4951 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4953 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4954 charset = CHARSET_FROM_ID (charset_ascii);
4956 else
4958 c = coding->default_char;
4959 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4960 charset_list, &code, charset);
4963 if (code == CHARSET_INVALID_CODE (charset))
4964 emacs_abort ();
4965 if (charset == charset_kanji)
4967 int c1, c2;
4968 JIS_TO_SJIS (code);
4969 c1 = code >> 8, c2 = code & 0xFF;
4970 EMIT_TWO_BYTES (c1, c2);
4972 else if (charset == charset_kana)
4973 EMIT_ONE_BYTE (code | 0x80);
4974 else if (charset_kanji2 && charset == charset_kanji2)
4976 int c1, c2;
4978 c1 = code >> 8;
4979 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4980 || c1 == 0x28
4981 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4983 JIS_TO_SJIS2 (code);
4984 c1 = code >> 8, c2 = code & 0xFF;
4985 EMIT_TWO_BYTES (c1, c2);
4987 else
4988 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4990 else
4991 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4994 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4995 coding->produced_char += produced_chars;
4996 coding->produced = dst - coding->destination;
4997 return 0;
5000 static bool
5001 encode_coding_big5 (struct coding_system *coding)
5003 bool multibytep = coding->dst_multibyte;
5004 int *charbuf = coding->charbuf;
5005 int *charbuf_end = charbuf + coding->charbuf_used;
5006 unsigned char *dst = coding->destination + coding->produced;
5007 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5008 int safe_room = 4;
5009 ptrdiff_t produced_chars = 0;
5010 Lisp_Object attrs, charset_list, val;
5011 bool ascii_compatible;
5012 struct charset *charset_big5;
5013 int c;
5015 CODING_GET_INFO (coding, attrs, charset_list);
5016 val = XCDR (charset_list);
5017 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5018 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5020 while (charbuf < charbuf_end)
5022 ASSURE_DESTINATION (safe_room);
5023 c = *charbuf++;
5024 /* Now encode the character C. */
5025 if (ASCII_CHAR_P (c) && ascii_compatible)
5026 EMIT_ONE_ASCII_BYTE (c);
5027 else if (CHAR_BYTE8_P (c))
5029 c = CHAR_TO_BYTE8 (c);
5030 EMIT_ONE_BYTE (c);
5032 else
5034 unsigned code;
5035 struct charset *charset;
5036 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5037 &code, charset);
5039 if (! charset)
5041 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5043 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5044 charset = CHARSET_FROM_ID (charset_ascii);
5046 else
5048 c = coding->default_char;
5049 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5050 charset_list, &code, charset);
5053 if (code == CHARSET_INVALID_CODE (charset))
5054 emacs_abort ();
5055 if (charset == charset_big5)
5057 int c1, c2;
5059 c1 = code >> 8, c2 = code & 0xFF;
5060 EMIT_TWO_BYTES (c1, c2);
5062 else
5063 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5066 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5067 coding->produced_char += produced_chars;
5068 coding->produced = dst - coding->destination;
5069 return 0;
5073 /*** 10. CCL handlers ***/
5075 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5076 Return true if a text is encoded in a coding system of which
5077 encoder/decoder are written in CCL program. */
5079 static bool
5080 detect_coding_ccl (struct coding_system *coding,
5081 struct coding_detection_info *detect_info)
5083 const unsigned char *src = coding->source, *src_base;
5084 const unsigned char *src_end = coding->source + coding->src_bytes;
5085 bool multibytep = coding->src_multibyte;
5086 ptrdiff_t consumed_chars = 0;
5087 int found = 0;
5088 unsigned char *valids;
5089 ptrdiff_t head_ascii = coding->head_ascii;
5090 Lisp_Object attrs;
5092 detect_info->checked |= CATEGORY_MASK_CCL;
5094 coding = &coding_categories[coding_category_ccl];
5095 valids = CODING_CCL_VALIDS (coding);
5096 attrs = CODING_ID_ATTRS (coding->id);
5097 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5098 src += head_ascii;
5100 while (1)
5102 int c;
5104 src_base = src;
5105 ONE_MORE_BYTE (c);
5106 if (c < 0 || ! valids[c])
5107 break;
5108 if ((valids[c] > 1))
5109 found = CATEGORY_MASK_CCL;
5111 detect_info->rejected |= CATEGORY_MASK_CCL;
5112 return 0;
5114 no_more_source:
5115 detect_info->found |= found;
5116 return 1;
5119 static void
5120 decode_coding_ccl (struct coding_system *coding)
5122 const unsigned char *src = coding->source + coding->consumed;
5123 const unsigned char *src_end = coding->source + coding->src_bytes;
5124 int *charbuf = coding->charbuf + coding->charbuf_used;
5125 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5126 ptrdiff_t consumed_chars = 0;
5127 bool multibytep = coding->src_multibyte;
5128 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5129 int source_charbuf[1024];
5130 int source_byteidx[1025];
5131 Lisp_Object attrs, charset_list;
5133 CODING_GET_INFO (coding, attrs, charset_list);
5135 while (1)
5137 const unsigned char *p = src;
5138 ptrdiff_t offset;
5139 int i = 0;
5141 if (multibytep)
5143 while (i < 1024 && p < src_end)
5145 source_byteidx[i] = p - src;
5146 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5148 source_byteidx[i] = p - src;
5150 else
5151 while (i < 1024 && p < src_end)
5152 source_charbuf[i++] = *p++;
5154 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5155 ccl->last_block = 1;
5156 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5157 charset_map_loaded = 0;
5158 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5159 charset_list);
5160 if (charset_map_loaded
5161 && (offset = coding_change_source (coding)))
5163 p += offset;
5164 src += offset;
5165 src_end += offset;
5167 charbuf += ccl->produced;
5168 if (multibytep)
5169 src += source_byteidx[ccl->consumed];
5170 else
5171 src += ccl->consumed;
5172 consumed_chars += ccl->consumed;
5173 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5174 break;
5177 switch (ccl->status)
5179 case CCL_STAT_SUSPEND_BY_SRC:
5180 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5181 break;
5182 case CCL_STAT_SUSPEND_BY_DST:
5183 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5184 break;
5185 case CCL_STAT_QUIT:
5186 case CCL_STAT_INVALID_CMD:
5187 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5188 break;
5189 default:
5190 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5191 break;
5193 coding->consumed_char += consumed_chars;
5194 coding->consumed = src - coding->source;
5195 coding->charbuf_used = charbuf - coding->charbuf;
5198 static bool
5199 encode_coding_ccl (struct coding_system *coding)
5201 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5202 bool multibytep = coding->dst_multibyte;
5203 int *charbuf = coding->charbuf;
5204 int *charbuf_end = charbuf + coding->charbuf_used;
5205 unsigned char *dst = coding->destination + coding->produced;
5206 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5207 int destination_charbuf[1024];
5208 ptrdiff_t produced_chars = 0;
5209 int i;
5210 Lisp_Object attrs, charset_list;
5212 CODING_GET_INFO (coding, attrs, charset_list);
5213 if (coding->consumed_char == coding->src_chars
5214 && coding->mode & CODING_MODE_LAST_BLOCK)
5215 ccl->last_block = 1;
5219 ptrdiff_t offset;
5221 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5222 charset_map_loaded = 0;
5223 ccl_driver (ccl, charbuf, destination_charbuf,
5224 charbuf_end - charbuf, 1024, charset_list);
5225 if (charset_map_loaded
5226 && (offset = coding_change_destination (coding)))
5227 dst += offset;
5228 if (multibytep)
5230 ASSURE_DESTINATION (ccl->produced * 2);
5231 for (i = 0; i < ccl->produced; i++)
5232 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5234 else
5236 ASSURE_DESTINATION (ccl->produced);
5237 for (i = 0; i < ccl->produced; i++)
5238 *dst++ = destination_charbuf[i] & 0xFF;
5239 produced_chars += ccl->produced;
5241 charbuf += ccl->consumed;
5242 if (ccl->status == CCL_STAT_QUIT
5243 || ccl->status == CCL_STAT_INVALID_CMD)
5244 break;
5246 while (charbuf < charbuf_end);
5248 switch (ccl->status)
5250 case CCL_STAT_SUSPEND_BY_SRC:
5251 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5252 break;
5253 case CCL_STAT_SUSPEND_BY_DST:
5254 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5255 break;
5256 case CCL_STAT_QUIT:
5257 case CCL_STAT_INVALID_CMD:
5258 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5259 break;
5260 default:
5261 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5262 break;
5265 coding->produced_char += produced_chars;
5266 coding->produced = dst - coding->destination;
5267 return 0;
5271 /*** 10, 11. no-conversion handlers ***/
5273 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5275 static void
5276 decode_coding_raw_text (struct coding_system *coding)
5278 bool eol_dos
5279 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5281 coding->chars_at_source = 1;
5282 coding->consumed_char = coding->src_chars;
5283 coding->consumed = coding->src_bytes;
5284 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5286 coding->consumed_char--;
5287 coding->consumed--;
5288 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5290 else
5291 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5294 static bool
5295 encode_coding_raw_text (struct coding_system *coding)
5297 bool multibytep = coding->dst_multibyte;
5298 int *charbuf = coding->charbuf;
5299 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5300 unsigned char *dst = coding->destination + coding->produced;
5301 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5302 ptrdiff_t produced_chars = 0;
5303 int c;
5305 if (multibytep)
5307 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5309 if (coding->src_multibyte)
5310 while (charbuf < charbuf_end)
5312 ASSURE_DESTINATION (safe_room);
5313 c = *charbuf++;
5314 if (ASCII_CHAR_P (c))
5315 EMIT_ONE_ASCII_BYTE (c);
5316 else if (CHAR_BYTE8_P (c))
5318 c = CHAR_TO_BYTE8 (c);
5319 EMIT_ONE_BYTE (c);
5321 else
5323 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5325 CHAR_STRING_ADVANCE (c, p1);
5328 EMIT_ONE_BYTE (*p0);
5329 p0++;
5331 while (p0 < p1);
5334 else
5335 while (charbuf < charbuf_end)
5337 ASSURE_DESTINATION (safe_room);
5338 c = *charbuf++;
5339 EMIT_ONE_BYTE (c);
5342 else
5344 if (coding->src_multibyte)
5346 int safe_room = MAX_MULTIBYTE_LENGTH;
5348 while (charbuf < charbuf_end)
5350 ASSURE_DESTINATION (safe_room);
5351 c = *charbuf++;
5352 if (ASCII_CHAR_P (c))
5353 *dst++ = c;
5354 else if (CHAR_BYTE8_P (c))
5355 *dst++ = CHAR_TO_BYTE8 (c);
5356 else
5357 CHAR_STRING_ADVANCE (c, dst);
5360 else
5362 ASSURE_DESTINATION (charbuf_end - charbuf);
5363 while (charbuf < charbuf_end && dst < dst_end)
5364 *dst++ = *charbuf++;
5366 produced_chars = dst - (coding->destination + coding->produced);
5368 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5369 coding->produced_char += produced_chars;
5370 coding->produced = dst - coding->destination;
5371 return 0;
5374 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5375 Return true if a text is encoded in a charset-based coding system. */
5377 static bool
5378 detect_coding_charset (struct coding_system *coding,
5379 struct coding_detection_info *detect_info)
5381 const unsigned char *src = coding->source, *src_base;
5382 const unsigned char *src_end = coding->source + coding->src_bytes;
5383 bool multibytep = coding->src_multibyte;
5384 ptrdiff_t consumed_chars = 0;
5385 Lisp_Object attrs, valids, name;
5386 int found = 0;
5387 ptrdiff_t head_ascii = coding->head_ascii;
5388 bool check_latin_extra = 0;
5390 detect_info->checked |= CATEGORY_MASK_CHARSET;
5392 coding = &coding_categories[coding_category_charset];
5393 attrs = CODING_ID_ATTRS (coding->id);
5394 valids = AREF (attrs, coding_attr_charset_valids);
5395 name = CODING_ID_NAME (coding->id);
5396 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5397 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5398 || strncmp (SSDATA (SYMBOL_NAME (name)),
5399 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5400 check_latin_extra = 1;
5402 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5403 src += head_ascii;
5405 while (1)
5407 int c;
5408 Lisp_Object val;
5409 struct charset *charset;
5410 int dim, idx;
5412 src_base = src;
5413 ONE_MORE_BYTE (c);
5414 if (c < 0)
5415 continue;
5416 val = AREF (valids, c);
5417 if (NILP (val))
5418 break;
5419 if (c >= 0x80)
5421 if (c < 0xA0
5422 && check_latin_extra
5423 && (!VECTORP (Vlatin_extra_code_table)
5424 || NILP (AREF (Vlatin_extra_code_table, c))))
5425 break;
5426 found = CATEGORY_MASK_CHARSET;
5428 if (INTEGERP (val))
5430 charset = CHARSET_FROM_ID (XFASTINT (val));
5431 dim = CHARSET_DIMENSION (charset);
5432 for (idx = 1; idx < dim; idx++)
5434 if (src == src_end)
5435 goto too_short;
5436 ONE_MORE_BYTE (c);
5437 if (c < charset->code_space[(dim - 1 - idx) * 4]
5438 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5439 break;
5441 if (idx < dim)
5442 break;
5444 else
5446 idx = 1;
5447 for (; CONSP (val); val = XCDR (val))
5449 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5450 dim = CHARSET_DIMENSION (charset);
5451 while (idx < dim)
5453 if (src == src_end)
5454 goto too_short;
5455 ONE_MORE_BYTE (c);
5456 if (c < charset->code_space[(dim - 1 - idx) * 4]
5457 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5458 break;
5459 idx++;
5461 if (idx == dim)
5463 val = Qnil;
5464 break;
5467 if (CONSP (val))
5468 break;
5471 too_short:
5472 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5473 return 0;
5475 no_more_source:
5476 detect_info->found |= found;
5477 return 1;
5480 static void
5481 decode_coding_charset (struct coding_system *coding)
5483 const unsigned char *src = coding->source + coding->consumed;
5484 const unsigned char *src_end = coding->source + coding->src_bytes;
5485 const unsigned char *src_base;
5486 int *charbuf = coding->charbuf + coding->charbuf_used;
5487 /* We may produce one charset annotation in one loop and one more at
5488 the end. */
5489 int *charbuf_end
5490 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5491 ptrdiff_t consumed_chars = 0, consumed_chars_base;
5492 bool multibytep = coding->src_multibyte;
5493 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5494 Lisp_Object valids;
5495 ptrdiff_t char_offset = coding->produced_char;
5496 ptrdiff_t last_offset = char_offset;
5497 int last_id = charset_ascii;
5498 bool eol_dos
5499 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5500 int byte_after_cr = -1;
5502 valids = AREF (attrs, coding_attr_charset_valids);
5504 while (1)
5506 int c;
5507 Lisp_Object val;
5508 struct charset *charset;
5509 int dim;
5510 int len = 1;
5511 unsigned code;
5513 src_base = src;
5514 consumed_chars_base = consumed_chars;
5516 if (charbuf >= charbuf_end)
5518 if (byte_after_cr >= 0)
5519 src_base--;
5520 break;
5523 if (byte_after_cr >= 0)
5525 c = byte_after_cr;
5526 byte_after_cr = -1;
5528 else
5530 ONE_MORE_BYTE (c);
5531 if (eol_dos && c == '\r')
5532 ONE_MORE_BYTE (byte_after_cr);
5534 if (c < 0)
5535 goto invalid_code;
5536 code = c;
5538 val = AREF (valids, c);
5539 if (! INTEGERP (val) && ! CONSP (val))
5540 goto invalid_code;
5541 if (INTEGERP (val))
5543 charset = CHARSET_FROM_ID (XFASTINT (val));
5544 dim = CHARSET_DIMENSION (charset);
5545 while (len < dim)
5547 ONE_MORE_BYTE (c);
5548 code = (code << 8) | c;
5549 len++;
5551 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5552 charset, code, c);
5554 else
5556 /* VAL is a list of charset IDs. It is assured that the
5557 list is sorted by charset dimensions (smaller one
5558 comes first). */
5559 while (CONSP (val))
5561 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5562 dim = CHARSET_DIMENSION (charset);
5563 while (len < dim)
5565 ONE_MORE_BYTE (c);
5566 code = (code << 8) | c;
5567 len++;
5569 CODING_DECODE_CHAR (coding, src, src_base,
5570 src_end, charset, code, c);
5571 if (c >= 0)
5572 break;
5573 val = XCDR (val);
5576 if (c < 0)
5577 goto invalid_code;
5578 if (charset->id != charset_ascii
5579 && last_id != charset->id)
5581 if (last_id != charset_ascii)
5582 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5583 last_id = charset->id;
5584 last_offset = char_offset;
5587 *charbuf++ = c;
5588 char_offset++;
5589 continue;
5591 invalid_code:
5592 src = src_base;
5593 consumed_chars = consumed_chars_base;
5594 ONE_MORE_BYTE (c);
5595 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5596 char_offset++;
5597 coding->errors++;
5600 no_more_source:
5601 if (last_id != charset_ascii)
5602 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5603 coding->consumed_char += consumed_chars_base;
5604 coding->consumed = src_base - coding->source;
5605 coding->charbuf_used = charbuf - coding->charbuf;
5608 static bool
5609 encode_coding_charset (struct coding_system *coding)
5611 bool multibytep = coding->dst_multibyte;
5612 int *charbuf = coding->charbuf;
5613 int *charbuf_end = charbuf + coding->charbuf_used;
5614 unsigned char *dst = coding->destination + coding->produced;
5615 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5616 int safe_room = MAX_MULTIBYTE_LENGTH;
5617 ptrdiff_t produced_chars = 0;
5618 Lisp_Object attrs, charset_list;
5619 bool ascii_compatible;
5620 int c;
5622 CODING_GET_INFO (coding, attrs, charset_list);
5623 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5625 while (charbuf < charbuf_end)
5627 struct charset *charset;
5628 unsigned code;
5630 ASSURE_DESTINATION (safe_room);
5631 c = *charbuf++;
5632 if (ascii_compatible && ASCII_CHAR_P (c))
5633 EMIT_ONE_ASCII_BYTE (c);
5634 else if (CHAR_BYTE8_P (c))
5636 c = CHAR_TO_BYTE8 (c);
5637 EMIT_ONE_BYTE (c);
5639 else
5641 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5642 &code, charset);
5644 if (charset)
5646 if (CHARSET_DIMENSION (charset) == 1)
5647 EMIT_ONE_BYTE (code);
5648 else if (CHARSET_DIMENSION (charset) == 2)
5649 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5650 else if (CHARSET_DIMENSION (charset) == 3)
5651 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5652 else
5653 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5654 (code >> 8) & 0xFF, code & 0xFF);
5656 else
5658 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5659 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5660 else
5661 c = coding->default_char;
5662 EMIT_ONE_BYTE (c);
5667 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5668 coding->produced_char += produced_chars;
5669 coding->produced = dst - coding->destination;
5670 return 0;
5674 /*** 7. C library functions ***/
5676 /* Setup coding context CODING from information about CODING_SYSTEM.
5677 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5678 CODING_SYSTEM is invalid, signal an error. */
5680 void
5681 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5683 Lisp_Object attrs;
5684 Lisp_Object eol_type;
5685 Lisp_Object coding_type;
5686 Lisp_Object val;
5688 if (NILP (coding_system))
5689 coding_system = Qundecided;
5691 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5693 attrs = CODING_ID_ATTRS (coding->id);
5694 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5696 coding->mode = 0;
5697 if (VECTORP (eol_type))
5698 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5699 | CODING_REQUIRE_DETECTION_MASK);
5700 else if (! EQ (eol_type, Qunix))
5701 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5702 | CODING_REQUIRE_ENCODING_MASK);
5703 else
5704 coding->common_flags = 0;
5705 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5706 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5707 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5708 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5709 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5710 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5712 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5713 coding->max_charset_id = SCHARS (val) - 1;
5714 coding->safe_charsets = SDATA (val);
5715 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5716 coding->carryover_bytes = 0;
5718 coding_type = CODING_ATTR_TYPE (attrs);
5719 if (EQ (coding_type, Qundecided))
5721 coding->detector = NULL;
5722 coding->decoder = decode_coding_raw_text;
5723 coding->encoder = encode_coding_raw_text;
5724 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5725 coding->spec.undecided.inhibit_nbd
5726 = (encode_inhibit_flag
5727 (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5728 coding->spec.undecided.inhibit_ied
5729 = (encode_inhibit_flag
5730 (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5731 coding->spec.undecided.prefer_utf_8
5732 = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5734 else if (EQ (coding_type, Qiso_2022))
5736 int i;
5737 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5739 /* Invoke graphic register 0 to plane 0. */
5740 CODING_ISO_INVOCATION (coding, 0) = 0;
5741 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5742 CODING_ISO_INVOCATION (coding, 1)
5743 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5744 /* Setup the initial status of designation. */
5745 for (i = 0; i < 4; i++)
5746 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5747 /* Not single shifting initially. */
5748 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5749 /* Beginning of buffer should also be regarded as bol. */
5750 CODING_ISO_BOL (coding) = 1;
5751 coding->detector = detect_coding_iso_2022;
5752 coding->decoder = decode_coding_iso_2022;
5753 coding->encoder = encode_coding_iso_2022;
5754 if (flags & CODING_ISO_FLAG_SAFE)
5755 coding->mode |= CODING_MODE_SAFE_ENCODING;
5756 coding->common_flags
5757 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5758 | CODING_REQUIRE_FLUSHING_MASK);
5759 if (flags & CODING_ISO_FLAG_COMPOSITION)
5760 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5761 if (flags & CODING_ISO_FLAG_DESIGNATION)
5762 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5763 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5765 setup_iso_safe_charsets (attrs);
5766 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5767 coding->max_charset_id = SCHARS (val) - 1;
5768 coding->safe_charsets = SDATA (val);
5770 CODING_ISO_FLAGS (coding) = flags;
5771 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5772 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5773 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5774 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5776 else if (EQ (coding_type, Qcharset))
5778 coding->detector = detect_coding_charset;
5779 coding->decoder = decode_coding_charset;
5780 coding->encoder = encode_coding_charset;
5781 coding->common_flags
5782 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5784 else if (EQ (coding_type, Qutf_8))
5786 val = AREF (attrs, coding_attr_utf_bom);
5787 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5788 : EQ (val, Qt) ? utf_with_bom
5789 : utf_without_bom);
5790 coding->detector = detect_coding_utf_8;
5791 coding->decoder = decode_coding_utf_8;
5792 coding->encoder = encode_coding_utf_8;
5793 coding->common_flags
5794 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5795 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5796 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5798 else if (EQ (coding_type, Qutf_16))
5800 val = AREF (attrs, coding_attr_utf_bom);
5801 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5802 : EQ (val, Qt) ? utf_with_bom
5803 : utf_without_bom);
5804 val = AREF (attrs, coding_attr_utf_16_endian);
5805 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5806 : utf_16_little_endian);
5807 CODING_UTF_16_SURROGATE (coding) = 0;
5808 coding->detector = detect_coding_utf_16;
5809 coding->decoder = decode_coding_utf_16;
5810 coding->encoder = encode_coding_utf_16;
5811 coding->common_flags
5812 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5813 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5814 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5816 else if (EQ (coding_type, Qccl))
5818 coding->detector = detect_coding_ccl;
5819 coding->decoder = decode_coding_ccl;
5820 coding->encoder = encode_coding_ccl;
5821 coding->common_flags
5822 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5823 | CODING_REQUIRE_FLUSHING_MASK);
5825 else if (EQ (coding_type, Qemacs_mule))
5827 coding->detector = detect_coding_emacs_mule;
5828 coding->decoder = decode_coding_emacs_mule;
5829 coding->encoder = encode_coding_emacs_mule;
5830 coding->common_flags
5831 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5832 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5833 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5835 Lisp_Object tail, safe_charsets;
5836 int max_charset_id = 0;
5838 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5839 tail = XCDR (tail))
5840 if (max_charset_id < XFASTINT (XCAR (tail)))
5841 max_charset_id = XFASTINT (XCAR (tail));
5842 safe_charsets = make_uninit_string (max_charset_id + 1);
5843 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5844 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5845 tail = XCDR (tail))
5846 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5847 coding->max_charset_id = max_charset_id;
5848 coding->safe_charsets = SDATA (safe_charsets);
5850 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5851 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5853 else if (EQ (coding_type, Qshift_jis))
5855 coding->detector = detect_coding_sjis;
5856 coding->decoder = decode_coding_sjis;
5857 coding->encoder = encode_coding_sjis;
5858 coding->common_flags
5859 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5861 else if (EQ (coding_type, Qbig5))
5863 coding->detector = detect_coding_big5;
5864 coding->decoder = decode_coding_big5;
5865 coding->encoder = encode_coding_big5;
5866 coding->common_flags
5867 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5869 else /* EQ (coding_type, Qraw_text) */
5871 coding->detector = NULL;
5872 coding->decoder = decode_coding_raw_text;
5873 coding->encoder = encode_coding_raw_text;
5874 if (! EQ (eol_type, Qunix))
5876 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5877 if (! VECTORP (eol_type))
5878 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5883 return;
5886 /* Return a list of charsets supported by CODING. */
5888 Lisp_Object
5889 coding_charset_list (struct coding_system *coding)
5891 Lisp_Object attrs, charset_list;
5893 CODING_GET_INFO (coding, attrs, charset_list);
5894 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5896 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5898 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5899 charset_list = Viso_2022_charset_list;
5901 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5903 charset_list = Vemacs_mule_charset_list;
5905 return charset_list;
5909 /* Return a list of charsets supported by CODING-SYSTEM. */
5911 Lisp_Object
5912 coding_system_charset_list (Lisp_Object coding_system)
5914 ptrdiff_t id;
5915 Lisp_Object attrs, charset_list;
5917 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5918 attrs = CODING_ID_ATTRS (id);
5920 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5922 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5924 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5925 charset_list = Viso_2022_charset_list;
5926 else
5927 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5929 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5931 charset_list = Vemacs_mule_charset_list;
5933 else
5935 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5937 return charset_list;
5941 /* Return raw-text or one of its subsidiaries that has the same
5942 eol_type as CODING-SYSTEM. */
5944 Lisp_Object
5945 raw_text_coding_system (Lisp_Object coding_system)
5947 Lisp_Object spec, attrs;
5948 Lisp_Object eol_type, raw_text_eol_type;
5950 if (NILP (coding_system))
5951 return Qraw_text;
5952 spec = CODING_SYSTEM_SPEC (coding_system);
5953 attrs = AREF (spec, 0);
5955 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5956 return coding_system;
5958 eol_type = AREF (spec, 2);
5959 if (VECTORP (eol_type))
5960 return Qraw_text;
5961 spec = CODING_SYSTEM_SPEC (Qraw_text);
5962 raw_text_eol_type = AREF (spec, 2);
5963 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5964 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5965 : AREF (raw_text_eol_type, 2));
5969 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5970 the subsidiary that has the same eol-spec as PARENT (if it is not
5971 nil and specifies end-of-line format) or the system's setting
5972 (system_eol_type). */
5974 Lisp_Object
5975 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5977 Lisp_Object spec, eol_type;
5979 if (NILP (coding_system))
5980 coding_system = Qraw_text;
5981 spec = CODING_SYSTEM_SPEC (coding_system);
5982 eol_type = AREF (spec, 2);
5983 if (VECTORP (eol_type))
5985 Lisp_Object parent_eol_type;
5987 if (! NILP (parent))
5989 Lisp_Object parent_spec;
5991 parent_spec = CODING_SYSTEM_SPEC (parent);
5992 parent_eol_type = AREF (parent_spec, 2);
5993 if (VECTORP (parent_eol_type))
5994 parent_eol_type = system_eol_type;
5996 else
5997 parent_eol_type = system_eol_type;
5998 if (EQ (parent_eol_type, Qunix))
5999 coding_system = AREF (eol_type, 0);
6000 else if (EQ (parent_eol_type, Qdos))
6001 coding_system = AREF (eol_type, 1);
6002 else if (EQ (parent_eol_type, Qmac))
6003 coding_system = AREF (eol_type, 2);
6005 return coding_system;
6009 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6010 decided for writing to a process. If not, complement them, and
6011 return a new coding system. */
6013 Lisp_Object
6014 complement_process_encoding_system (Lisp_Object coding_system)
6016 Lisp_Object coding_base = Qnil, eol_base = Qnil;
6017 Lisp_Object spec, attrs;
6018 int i;
6020 for (i = 0; i < 3; i++)
6022 if (i == 1)
6023 coding_system = CDR_SAFE (Vdefault_process_coding_system);
6024 else if (i == 2)
6025 coding_system = preferred_coding_system ();
6026 spec = CODING_SYSTEM_SPEC (coding_system);
6027 if (NILP (spec))
6028 continue;
6029 attrs = AREF (spec, 0);
6030 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6031 coding_base = CODING_ATTR_BASE_NAME (attrs);
6032 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6033 eol_base = coding_system;
6034 if (! NILP (coding_base) && ! NILP (eol_base))
6035 break;
6038 if (i > 0)
6039 /* The original CODING_SYSTEM didn't specify text-conversion or
6040 eol-conversion. Be sure that we return a fully complemented
6041 coding system. */
6042 coding_system = coding_inherit_eol_type (coding_base, eol_base);
6043 return coding_system;
6047 /* Emacs has a mechanism to automatically detect a coding system if it
6048 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6049 it's impossible to distinguish some coding systems accurately
6050 because they use the same range of codes. So, at first, coding
6051 systems are categorized into 7, those are:
6053 o coding-category-emacs-mule
6055 The category for a coding system which has the same code range
6056 as Emacs' internal format. Assigned the coding-system (Lisp
6057 symbol) `emacs-mule' by default.
6059 o coding-category-sjis
6061 The category for a coding system which has the same code range
6062 as SJIS. Assigned the coding-system (Lisp
6063 symbol) `japanese-shift-jis' by default.
6065 o coding-category-iso-7
6067 The category for a coding system which has the same code range
6068 as ISO2022 of 7-bit environment. This doesn't use any locking
6069 shift and single shift functions. This can encode/decode all
6070 charsets. Assigned the coding-system (Lisp symbol)
6071 `iso-2022-7bit' by default.
6073 o coding-category-iso-7-tight
6075 Same as coding-category-iso-7 except that this can
6076 encode/decode only the specified charsets.
6078 o coding-category-iso-8-1
6080 The category for a coding system which has the same code range
6081 as ISO2022 of 8-bit environment and graphic plane 1 used only
6082 for DIMENSION1 charset. This doesn't use any locking shift
6083 and single shift functions. Assigned the coding-system (Lisp
6084 symbol) `iso-latin-1' by default.
6086 o coding-category-iso-8-2
6088 The category for a coding system which has the same code range
6089 as ISO2022 of 8-bit environment and graphic plane 1 used only
6090 for DIMENSION2 charset. This doesn't use any locking shift
6091 and single shift functions. Assigned the coding-system (Lisp
6092 symbol) `japanese-iso-8bit' by default.
6094 o coding-category-iso-7-else
6096 The category for a coding system which has the same code range
6097 as ISO2022 of 7-bit environment but uses locking shift or
6098 single shift functions. Assigned the coding-system (Lisp
6099 symbol) `iso-2022-7bit-lock' by default.
6101 o coding-category-iso-8-else
6103 The category for a coding system which has the same code range
6104 as ISO2022 of 8-bit environment but uses locking shift or
6105 single shift functions. Assigned the coding-system (Lisp
6106 symbol) `iso-2022-8bit-ss2' by default.
6108 o coding-category-big5
6110 The category for a coding system which has the same code range
6111 as BIG5. Assigned the coding-system (Lisp symbol)
6112 `cn-big5' by default.
6114 o coding-category-utf-8
6116 The category for a coding system which has the same code range
6117 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6118 symbol) `utf-8' by default.
6120 o coding-category-utf-16-be
6122 The category for a coding system in which a text has an
6123 Unicode signature (cf. Unicode Standard) in the order of BIG
6124 endian at the head. Assigned the coding-system (Lisp symbol)
6125 `utf-16-be' by default.
6127 o coding-category-utf-16-le
6129 The category for a coding system in which a text has an
6130 Unicode signature (cf. Unicode Standard) in the order of
6131 LITTLE endian at the head. Assigned the coding-system (Lisp
6132 symbol) `utf-16-le' by default.
6134 o coding-category-ccl
6136 The category for a coding system of which encoder/decoder is
6137 written in CCL programs. The default value is nil, i.e., no
6138 coding system is assigned.
6140 o coding-category-binary
6142 The category for a coding system not categorized in any of the
6143 above. Assigned the coding-system (Lisp symbol)
6144 `no-conversion' by default.
6146 Each of them is a Lisp symbol and the value is an actual
6147 `coding-system's (this is also a Lisp symbol) assigned by a user.
6148 What Emacs does actually is to detect a category of coding system.
6149 Then, it uses a `coding-system' assigned to it. If Emacs can't
6150 decide only one possible category, it selects a category of the
6151 highest priority. Priorities of categories are also specified by a
6152 user in a Lisp variable `coding-category-list'.
6156 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6157 int eol_seen);
6160 /* Return the number of ASCII characters at the head of the source.
6161 By side effects, set coding->head_ascii and update
6162 coding->eol_seen. The value of coding->eol_seen is "logical or" of
6163 EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6164 reliable only when all the source bytes are ASCII. */
6166 static int
6167 check_ascii (struct coding_system *coding)
6169 const unsigned char *src, *end;
6170 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6171 int eol_seen = coding->eol_seen;
6173 coding_set_source (coding);
6174 src = coding->source;
6175 end = src + coding->src_bytes;
6177 if (inhibit_eol_conversion
6178 || SYMBOLP (eol_type))
6180 /* We don't have to check EOL format. */
6181 while (src < end && !( *src & 0x80))
6183 if (*src++ == '\n')
6184 eol_seen |= EOL_SEEN_LF;
6187 else
6189 end--; /* We look ahead one byte for "CR LF". */
6190 while (src < end)
6192 int c = *src;
6194 if (c & 0x80)
6195 break;
6196 src++;
6197 if (c == '\r')
6199 if (*src == '\n')
6201 eol_seen |= EOL_SEEN_CRLF;
6202 src++;
6204 else
6205 eol_seen |= EOL_SEEN_CR;
6207 else if (c == '\n')
6208 eol_seen |= EOL_SEEN_LF;
6210 if (src == end)
6212 int c = *src;
6214 /* All bytes but the last one C are ASCII. */
6215 if (! (c & 0x80))
6217 if (c == '\r')
6218 eol_seen |= EOL_SEEN_CR;
6219 else if (c == '\n')
6220 eol_seen |= EOL_SEEN_LF;
6221 src++;
6225 coding->head_ascii = src - coding->source;
6226 coding->eol_seen = eol_seen;
6227 return (coding->head_ascii);
6231 /* Return the number of characters at the source if all the bytes are
6232 valid UTF-8 (of Unicode range). Otherwise, return -1. By side
6233 effects, update coding->eol_seen. The value of coding->eol_seen is
6234 "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6235 the value is reliable only when all the source bytes are valid
6236 UTF-8. */
6238 static int
6239 check_utf_8 (struct coding_system *coding)
6241 const unsigned char *src, *end;
6242 int eol_seen;
6243 int nchars = coding->head_ascii;
6245 if (coding->head_ascii < 0)
6246 check_ascii (coding);
6247 else
6248 coding_set_source (coding);
6249 src = coding->source + coding->head_ascii;
6250 /* We look ahead one byte for CR LF. */
6251 end = coding->source + coding->src_bytes - 1;
6252 eol_seen = coding->eol_seen;
6253 while (src < end)
6255 int c = *src;
6257 if (UTF_8_1_OCTET_P (*src))
6259 src++;
6260 if (c < 0x20)
6262 if (c == '\r')
6264 if (*src == '\n')
6266 eol_seen |= EOL_SEEN_CRLF;
6267 src++;
6268 nchars++;
6270 else
6271 eol_seen |= EOL_SEEN_CR;
6273 else if (c == '\n')
6274 eol_seen |= EOL_SEEN_LF;
6277 else if (UTF_8_2_OCTET_LEADING_P (c))
6279 if (c < 0xC2 /* overlong sequence */
6280 || src + 1 >= end
6281 || ! UTF_8_EXTRA_OCTET_P (src[1]))
6282 return -1;
6283 src += 2;
6285 else if (UTF_8_3_OCTET_LEADING_P (c))
6287 if (src + 2 >= end
6288 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6289 && UTF_8_EXTRA_OCTET_P (src[2])))
6290 return -1;
6291 c = (((c & 0xF) << 12)
6292 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6293 if (c < 0x800 /* overlong sequence */
6294 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6295 return -1;
6296 src += 3;
6298 else if (UTF_8_4_OCTET_LEADING_P (c))
6300 if (src + 3 >= end
6301 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6302 && UTF_8_EXTRA_OCTET_P (src[2])
6303 && UTF_8_EXTRA_OCTET_P (src[3])))
6304 return -1;
6305 c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6306 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6307 if (c < 0x10000 /* overlong sequence */
6308 || c >= 0x110000) /* non-Unicode character */
6309 return -1;
6310 src += 4;
6312 else
6313 return -1;
6314 nchars++;
6317 if (src == end)
6319 if (! UTF_8_1_OCTET_P (*src))
6320 return -1;
6321 nchars++;
6322 if (*src == '\r')
6323 eol_seen |= EOL_SEEN_CR;
6324 else if (*src == '\n')
6325 eol_seen |= EOL_SEEN_LF;
6327 coding->eol_seen = eol_seen;
6328 return nchars;
6332 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6333 SOURCE is encoded. If CATEGORY is one of
6334 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6335 two-byte, else they are encoded by one-byte.
6337 Return one of EOL_SEEN_XXX. */
6339 #define MAX_EOL_CHECK_COUNT 3
6341 static int
6342 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6343 enum coding_category category)
6345 const unsigned char *src = source, *src_end = src + src_bytes;
6346 unsigned char c;
6347 int total = 0;
6348 int eol_seen = EOL_SEEN_NONE;
6350 if ((1 << category) & CATEGORY_MASK_UTF_16)
6352 bool msb = category == (coding_category_utf_16_le
6353 | coding_category_utf_16_le_nosig);
6354 bool lsb = !msb;
6356 while (src + 1 < src_end)
6358 c = src[lsb];
6359 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6361 int this_eol;
6363 if (c == '\n')
6364 this_eol = EOL_SEEN_LF;
6365 else if (src + 3 >= src_end
6366 || src[msb + 2] != 0
6367 || src[lsb + 2] != '\n')
6368 this_eol = EOL_SEEN_CR;
6369 else
6371 this_eol = EOL_SEEN_CRLF;
6372 src += 2;
6375 if (eol_seen == EOL_SEEN_NONE)
6376 /* This is the first end-of-line. */
6377 eol_seen = this_eol;
6378 else if (eol_seen != this_eol)
6380 /* The found type is different from what found before.
6381 Allow for stray ^M characters in DOS EOL files. */
6382 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6383 || (eol_seen == EOL_SEEN_CRLF
6384 && this_eol == EOL_SEEN_CR))
6385 eol_seen = EOL_SEEN_CRLF;
6386 else
6388 eol_seen = EOL_SEEN_LF;
6389 break;
6392 if (++total == MAX_EOL_CHECK_COUNT)
6393 break;
6395 src += 2;
6398 else
6399 while (src < src_end)
6401 c = *src++;
6402 if (c == '\n' || c == '\r')
6404 int this_eol;
6406 if (c == '\n')
6407 this_eol = EOL_SEEN_LF;
6408 else if (src >= src_end || *src != '\n')
6409 this_eol = EOL_SEEN_CR;
6410 else
6411 this_eol = EOL_SEEN_CRLF, src++;
6413 if (eol_seen == EOL_SEEN_NONE)
6414 /* This is the first end-of-line. */
6415 eol_seen = this_eol;
6416 else if (eol_seen != this_eol)
6418 /* The found type is different from what found before.
6419 Allow for stray ^M characters in DOS EOL files. */
6420 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6421 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6422 eol_seen = EOL_SEEN_CRLF;
6423 else
6425 eol_seen = EOL_SEEN_LF;
6426 break;
6429 if (++total == MAX_EOL_CHECK_COUNT)
6430 break;
6433 return eol_seen;
6437 static Lisp_Object
6438 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6440 Lisp_Object eol_type;
6442 eol_type = CODING_ID_EOL_TYPE (coding->id);
6443 if (! VECTORP (eol_type))
6444 /* Already adjusted. */
6445 return eol_type;
6446 if (eol_seen & EOL_SEEN_LF)
6448 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6449 eol_type = Qunix;
6451 else if (eol_seen & EOL_SEEN_CRLF)
6453 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6454 eol_type = Qdos;
6456 else if (eol_seen & EOL_SEEN_CR)
6458 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6459 eol_type = Qmac;
6461 return eol_type;
6464 /* Detect how a text specified in CODING is encoded. If a coding
6465 system is detected, update fields of CODING by the detected coding
6466 system. */
6468 static void
6469 detect_coding (struct coding_system *coding)
6471 const unsigned char *src, *src_end;
6472 unsigned int saved_mode = coding->mode;
6473 Lisp_Object found = Qnil;
6474 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6476 coding->consumed = coding->consumed_char = 0;
6477 coding->produced = coding->produced_char = 0;
6478 coding_set_source (coding);
6480 src_end = coding->source + coding->src_bytes;
6482 coding->eol_seen = EOL_SEEN_NONE;
6483 /* If we have not yet decided the text encoding type, detect it
6484 now. */
6485 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6487 int c, i;
6488 struct coding_detection_info detect_info;
6489 bool null_byte_found = 0, eight_bit_found = 0;
6490 bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6491 inhibit_null_byte_detection);
6492 bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6493 inhibit_iso_escape_detection);
6494 bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6496 coding->head_ascii = 0;
6497 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6498 for (src = coding->source; src < src_end; src++)
6500 c = *src;
6501 if (c & 0x80)
6503 eight_bit_found = 1;
6504 if (null_byte_found)
6505 break;
6507 else if (c < 0x20)
6509 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6510 && ! inhibit_ied
6511 && ! detect_info.checked)
6513 if (detect_coding_iso_2022 (coding, &detect_info))
6515 /* We have scanned the whole data. */
6516 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6518 /* We didn't find an 8-bit code. We may
6519 have found a null-byte, but it's very
6520 rare that a binary file conforms to
6521 ISO-2022. */
6522 src = src_end;
6523 coding->head_ascii = src - coding->source;
6525 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6526 break;
6529 else if (! c && !inhibit_nbd)
6531 null_byte_found = 1;
6532 if (eight_bit_found)
6533 break;
6535 else if (! disable_ascii_optimization
6536 && ! inhibit_eol_conversion)
6538 if (c == '\r')
6540 if (src < src_end && src[1] == '\n')
6542 coding->eol_seen |= EOL_SEEN_CRLF;
6543 src++;
6544 if (! eight_bit_found)
6545 coding->head_ascii++;
6547 else
6548 coding->eol_seen |= EOL_SEEN_CR;
6550 else if (c == '\n')
6552 coding->eol_seen |= EOL_SEEN_LF;
6556 if (! eight_bit_found)
6557 coding->head_ascii++;
6559 else if (! eight_bit_found)
6560 coding->head_ascii++;
6563 if (null_byte_found || eight_bit_found
6564 || coding->head_ascii < coding->src_bytes
6565 || detect_info.found)
6567 enum coding_category category;
6568 struct coding_system *this;
6570 if (coding->head_ascii == coding->src_bytes)
6571 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6572 for (i = 0; i < coding_category_raw_text; i++)
6574 category = coding_priorities[i];
6575 this = coding_categories + category;
6576 if (detect_info.found & (1 << category))
6577 break;
6579 else
6581 if (null_byte_found)
6583 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6584 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6586 else if (prefer_utf_8
6587 && detect_coding_utf_8 (coding, &detect_info))
6589 detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6590 detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6592 for (i = 0; i < coding_category_raw_text; i++)
6594 category = coding_priorities[i];
6595 this = coding_categories + category;
6596 /* Some of this->detector (e.g. detect_coding_sjis)
6597 require this information. */
6598 coding->id = this->id;
6599 if (this->id < 0)
6601 /* No coding system of this category is defined. */
6602 detect_info.rejected |= (1 << category);
6604 else if (category >= coding_category_raw_text)
6605 continue;
6606 else if (detect_info.checked & (1 << category))
6608 if (detect_info.found & (1 << category))
6609 break;
6611 else if ((*(this->detector)) (coding, &detect_info)
6612 && detect_info.found & (1 << category))
6613 break;
6617 if (i < coding_category_raw_text)
6619 if (category == coding_category_utf_8_auto)
6621 Lisp_Object coding_systems;
6623 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6624 coding_attr_utf_bom);
6625 if (CONSP (coding_systems))
6627 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6628 found = XCAR (coding_systems);
6629 else
6630 found = XCDR (coding_systems);
6632 else
6633 found = CODING_ID_NAME (this->id);
6635 else if (category == coding_category_utf_16_auto)
6637 Lisp_Object coding_systems;
6639 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6640 coding_attr_utf_bom);
6641 if (CONSP (coding_systems))
6643 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6644 found = XCAR (coding_systems);
6645 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6646 found = XCDR (coding_systems);
6648 else
6649 found = CODING_ID_NAME (this->id);
6651 else
6652 found = CODING_ID_NAME (this->id);
6654 else if (null_byte_found)
6655 found = Qno_conversion;
6656 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6657 == CATEGORY_MASK_ANY)
6658 found = Qraw_text;
6659 else if (detect_info.rejected)
6660 for (i = 0; i < coding_category_raw_text; i++)
6661 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6663 this = coding_categories + coding_priorities[i];
6664 found = CODING_ID_NAME (this->id);
6665 break;
6669 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6670 == coding_category_utf_8_auto)
6672 Lisp_Object coding_systems;
6673 struct coding_detection_info detect_info;
6675 coding_systems
6676 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6677 detect_info.found = detect_info.rejected = 0;
6678 if (check_ascii (coding) == coding->src_bytes)
6680 if (CONSP (coding_systems))
6681 found = XCDR (coding_systems);
6683 else
6685 if (CONSP (coding_systems)
6686 && detect_coding_utf_8 (coding, &detect_info))
6688 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6689 found = XCAR (coding_systems);
6690 else
6691 found = XCDR (coding_systems);
6695 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6696 == coding_category_utf_16_auto)
6698 Lisp_Object coding_systems;
6699 struct coding_detection_info detect_info;
6701 coding_systems
6702 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6703 detect_info.found = detect_info.rejected = 0;
6704 coding->head_ascii = 0;
6705 if (CONSP (coding_systems)
6706 && detect_coding_utf_16 (coding, &detect_info))
6708 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6709 found = XCAR (coding_systems);
6710 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6711 found = XCDR (coding_systems);
6715 if (! NILP (found))
6717 int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6718 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6719 : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6720 : EOL_SEEN_LF);
6722 setup_coding_system (found, coding);
6723 if (specified_eol != EOL_SEEN_NONE)
6724 adjust_coding_eol_type (coding, specified_eol);
6727 coding->mode = saved_mode;
6731 static void
6732 decode_eol (struct coding_system *coding)
6734 Lisp_Object eol_type;
6735 unsigned char *p, *pbeg, *pend;
6737 eol_type = CODING_ID_EOL_TYPE (coding->id);
6738 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6739 return;
6741 if (NILP (coding->dst_object))
6742 pbeg = coding->destination;
6743 else
6744 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6745 pend = pbeg + coding->produced;
6747 if (VECTORP (eol_type))
6749 int eol_seen = EOL_SEEN_NONE;
6751 for (p = pbeg; p < pend; p++)
6753 if (*p == '\n')
6754 eol_seen |= EOL_SEEN_LF;
6755 else if (*p == '\r')
6757 if (p + 1 < pend && *(p + 1) == '\n')
6759 eol_seen |= EOL_SEEN_CRLF;
6760 p++;
6762 else
6763 eol_seen |= EOL_SEEN_CR;
6766 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6767 if ((eol_seen & EOL_SEEN_CRLF) != 0
6768 && (eol_seen & EOL_SEEN_CR) != 0
6769 && (eol_seen & EOL_SEEN_LF) == 0)
6770 eol_seen = EOL_SEEN_CRLF;
6771 else if (eol_seen != EOL_SEEN_NONE
6772 && eol_seen != EOL_SEEN_LF
6773 && eol_seen != EOL_SEEN_CRLF
6774 && eol_seen != EOL_SEEN_CR)
6775 eol_seen = EOL_SEEN_LF;
6776 if (eol_seen != EOL_SEEN_NONE)
6777 eol_type = adjust_coding_eol_type (coding, eol_seen);
6780 if (EQ (eol_type, Qmac))
6782 for (p = pbeg; p < pend; p++)
6783 if (*p == '\r')
6784 *p = '\n';
6786 else if (EQ (eol_type, Qdos))
6788 ptrdiff_t n = 0;
6790 if (NILP (coding->dst_object))
6792 /* Start deleting '\r' from the tail to minimize the memory
6793 movement. */
6794 for (p = pend - 2; p >= pbeg; p--)
6795 if (*p == '\r')
6797 memmove (p, p + 1, pend-- - p - 1);
6798 n++;
6801 else
6803 ptrdiff_t pos_byte = coding->dst_pos_byte;
6804 ptrdiff_t pos = coding->dst_pos;
6805 ptrdiff_t pos_end = pos + coding->produced_char - 1;
6807 while (pos < pos_end)
6809 p = BYTE_POS_ADDR (pos_byte);
6810 if (*p == '\r' && p[1] == '\n')
6812 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6813 n++;
6814 pos_end--;
6816 pos++;
6817 if (coding->dst_multibyte)
6818 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6819 else
6820 pos_byte++;
6823 coding->produced -= n;
6824 coding->produced_char -= n;
6829 /* Return a translation table (or list of them) from coding system
6830 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6831 not ENCODEP). */
6833 static Lisp_Object
6834 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6836 Lisp_Object standard, translation_table;
6837 Lisp_Object val;
6839 if (NILP (Venable_character_translation))
6841 if (max_lookup)
6842 *max_lookup = 0;
6843 return Qnil;
6845 if (encodep)
6846 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6847 standard = Vstandard_translation_table_for_encode;
6848 else
6849 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6850 standard = Vstandard_translation_table_for_decode;
6851 if (NILP (translation_table))
6852 translation_table = standard;
6853 else
6855 if (SYMBOLP (translation_table))
6856 translation_table = Fget (translation_table, Qtranslation_table);
6857 else if (CONSP (translation_table))
6859 translation_table = Fcopy_sequence (translation_table);
6860 for (val = translation_table; CONSP (val); val = XCDR (val))
6861 if (SYMBOLP (XCAR (val)))
6862 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6864 if (CHAR_TABLE_P (standard))
6866 if (CONSP (translation_table))
6867 translation_table = nconc2 (translation_table, list1 (standard));
6868 else
6869 translation_table = list2 (translation_table, standard);
6873 if (max_lookup)
6875 *max_lookup = 1;
6876 if (CHAR_TABLE_P (translation_table)
6877 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6879 val = XCHAR_TABLE (translation_table)->extras[1];
6880 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6881 *max_lookup = XFASTINT (val);
6883 else if (CONSP (translation_table))
6885 Lisp_Object tail;
6887 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6888 if (CHAR_TABLE_P (XCAR (tail))
6889 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6891 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6892 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6893 *max_lookup = XFASTINT (tailval);
6897 return translation_table;
6900 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6901 do { \
6902 trans = Qnil; \
6903 if (CHAR_TABLE_P (table)) \
6905 trans = CHAR_TABLE_REF (table, c); \
6906 if (CHARACTERP (trans)) \
6907 c = XFASTINT (trans), trans = Qnil; \
6909 else if (CONSP (table)) \
6911 Lisp_Object tail; \
6913 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6914 if (CHAR_TABLE_P (XCAR (tail))) \
6916 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6917 if (CHARACTERP (trans)) \
6918 c = XFASTINT (trans), trans = Qnil; \
6919 else if (! NILP (trans)) \
6920 break; \
6923 } while (0)
6926 /* Return a translation of character(s) at BUF according to TRANS.
6927 TRANS is TO-CHAR or ((FROM . TO) ...) where
6928 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6929 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6930 translation is found, and Qnil if not found..
6931 If BUF is too short to lookup characters in FROM, return Qt. */
6933 static Lisp_Object
6934 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6937 if (INTEGERP (trans))
6938 return trans;
6939 for (; CONSP (trans); trans = XCDR (trans))
6941 Lisp_Object val = XCAR (trans);
6942 Lisp_Object from = XCAR (val);
6943 ptrdiff_t len = ASIZE (from);
6944 ptrdiff_t i;
6946 for (i = 0; i < len; i++)
6948 if (buf + i == buf_end)
6949 return Qt;
6950 if (XINT (AREF (from, i)) != buf[i])
6951 break;
6953 if (i == len)
6954 return val;
6956 return Qnil;
6960 static int
6961 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6962 bool last_block)
6964 unsigned char *dst = coding->destination + coding->produced;
6965 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6966 ptrdiff_t produced;
6967 ptrdiff_t produced_chars = 0;
6968 int carryover = 0;
6970 if (! coding->chars_at_source)
6972 /* Source characters are in coding->charbuf. */
6973 int *buf = coding->charbuf;
6974 int *buf_end = buf + coding->charbuf_used;
6976 if (EQ (coding->src_object, coding->dst_object))
6978 coding_set_source (coding);
6979 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6982 while (buf < buf_end)
6984 int c = *buf;
6985 ptrdiff_t i;
6987 if (c >= 0)
6989 ptrdiff_t from_nchars = 1, to_nchars = 1;
6990 Lisp_Object trans = Qnil;
6992 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6993 if (! NILP (trans))
6995 trans = get_translation (trans, buf, buf_end);
6996 if (INTEGERP (trans))
6997 c = XINT (trans);
6998 else if (CONSP (trans))
7000 from_nchars = ASIZE (XCAR (trans));
7001 trans = XCDR (trans);
7002 if (INTEGERP (trans))
7003 c = XINT (trans);
7004 else
7006 to_nchars = ASIZE (trans);
7007 c = XINT (AREF (trans, 0));
7010 else if (EQ (trans, Qt) && ! last_block)
7011 break;
7014 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7016 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7017 / MAX_MULTIBYTE_LENGTH)
7018 < to_nchars)
7019 memory_full (SIZE_MAX);
7020 dst = alloc_destination (coding,
7021 buf_end - buf
7022 + MAX_MULTIBYTE_LENGTH * to_nchars,
7023 dst);
7024 if (EQ (coding->src_object, coding->dst_object))
7026 coding_set_source (coding);
7027 dst_end = (((unsigned char *) coding->source)
7028 + coding->consumed);
7030 else
7031 dst_end = coding->destination + coding->dst_bytes;
7034 for (i = 0; i < to_nchars; i++)
7036 if (i > 0)
7037 c = XINT (AREF (trans, i));
7038 if (coding->dst_multibyte
7039 || ! CHAR_BYTE8_P (c))
7040 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7041 else
7042 *dst++ = CHAR_TO_BYTE8 (c);
7044 produced_chars += to_nchars;
7045 buf += from_nchars;
7047 else
7048 /* This is an annotation datum. (-C) is the length. */
7049 buf += -c;
7051 carryover = buf_end - buf;
7053 else
7055 /* Source characters are at coding->source. */
7056 const unsigned char *src = coding->source;
7057 const unsigned char *src_end = src + coding->consumed;
7059 if (EQ (coding->dst_object, coding->src_object))
7060 dst_end = (unsigned char *) src;
7061 if (coding->src_multibyte != coding->dst_multibyte)
7063 if (coding->src_multibyte)
7065 bool multibytep = 1;
7066 ptrdiff_t consumed_chars = 0;
7068 while (1)
7070 const unsigned char *src_base = src;
7071 int c;
7073 ONE_MORE_BYTE (c);
7074 if (dst == dst_end)
7076 if (EQ (coding->src_object, coding->dst_object))
7077 dst_end = (unsigned char *) src;
7078 if (dst == dst_end)
7080 ptrdiff_t offset = src - coding->source;
7082 dst = alloc_destination (coding, src_end - src + 1,
7083 dst);
7084 dst_end = coding->destination + coding->dst_bytes;
7085 coding_set_source (coding);
7086 src = coding->source + offset;
7087 src_end = coding->source + coding->consumed;
7088 if (EQ (coding->src_object, coding->dst_object))
7089 dst_end = (unsigned char *) src;
7092 *dst++ = c;
7093 produced_chars++;
7095 no_more_source:
7098 else
7099 while (src < src_end)
7101 bool multibytep = 1;
7102 int c = *src++;
7104 if (dst >= dst_end - 1)
7106 if (EQ (coding->src_object, coding->dst_object))
7107 dst_end = (unsigned char *) src;
7108 if (dst >= dst_end - 1)
7110 ptrdiff_t offset = src - coding->source;
7111 ptrdiff_t more_bytes;
7113 if (EQ (coding->src_object, coding->dst_object))
7114 more_bytes = ((src_end - src) / 2) + 2;
7115 else
7116 more_bytes = src_end - src + 2;
7117 dst = alloc_destination (coding, more_bytes, dst);
7118 dst_end = coding->destination + coding->dst_bytes;
7119 coding_set_source (coding);
7120 src = coding->source + offset;
7121 src_end = coding->source + coding->consumed;
7122 if (EQ (coding->src_object, coding->dst_object))
7123 dst_end = (unsigned char *) src;
7126 EMIT_ONE_BYTE (c);
7129 else
7131 if (!EQ (coding->src_object, coding->dst_object))
7133 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7135 if (require > 0)
7137 ptrdiff_t offset = src - coding->source;
7139 dst = alloc_destination (coding, require, dst);
7140 coding_set_source (coding);
7141 src = coding->source + offset;
7142 src_end = coding->source + coding->consumed;
7145 produced_chars = coding->consumed_char;
7146 while (src < src_end)
7147 *dst++ = *src++;
7151 produced = dst - (coding->destination + coding->produced);
7152 if (BUFFERP (coding->dst_object) && produced_chars > 0)
7153 insert_from_gap (produced_chars, produced, 0);
7154 coding->produced += produced;
7155 coding->produced_char += produced_chars;
7156 return carryover;
7159 /* Compose text in CODING->object according to the annotation data at
7160 CHARBUF. CHARBUF is an array:
7161 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7164 static void
7165 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7167 int len;
7168 ptrdiff_t to;
7169 enum composition_method method;
7170 Lisp_Object components;
7172 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7173 to = pos + charbuf[2];
7174 method = (enum composition_method) (charbuf[4]);
7176 if (method == COMPOSITION_RELATIVE)
7177 components = Qnil;
7178 else
7180 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7181 int i, j;
7183 if (method == COMPOSITION_WITH_RULE)
7184 len = charbuf[2] * 3 - 2;
7185 charbuf += MAX_ANNOTATION_LENGTH;
7186 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7187 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7189 if (charbuf[i] >= 0)
7190 args[j] = make_number (charbuf[i]);
7191 else
7193 i++;
7194 args[j] = make_number (charbuf[i] % 0x100);
7197 components = (i == j ? Fstring (j, args) : Fvector (j, args));
7199 compose_text (pos, to, components, Qnil, coding->dst_object);
7203 /* Put `charset' property on text in CODING->object according to
7204 the annotation data at CHARBUF. CHARBUF is an array:
7205 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7208 static void
7209 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7211 ptrdiff_t from = pos - charbuf[2];
7212 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7214 Fput_text_property (make_number (from), make_number (pos),
7215 Qcharset, CHARSET_NAME (charset),
7216 coding->dst_object);
7220 #define CHARBUF_SIZE 0x4000
7222 #define ALLOC_CONVERSION_WORK_AREA(coding) \
7223 do { \
7224 coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int)); \
7225 coding->charbuf_size = CHARBUF_SIZE; \
7226 } while (0)
7229 static void
7230 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7232 int *charbuf = coding->charbuf;
7233 int *charbuf_end = charbuf + coding->charbuf_used;
7235 if (NILP (coding->dst_object))
7236 return;
7238 while (charbuf < charbuf_end)
7240 if (*charbuf >= 0)
7241 pos++, charbuf++;
7242 else
7244 int len = -*charbuf;
7246 if (len > 2)
7247 switch (charbuf[1])
7249 case CODING_ANNOTATE_COMPOSITION_MASK:
7250 produce_composition (coding, charbuf, pos);
7251 break;
7252 case CODING_ANNOTATE_CHARSET_MASK:
7253 produce_charset (coding, charbuf, pos);
7254 break;
7256 charbuf += len;
7261 /* Decode the data at CODING->src_object into CODING->dst_object.
7262 CODING->src_object is a buffer, a string, or nil.
7263 CODING->dst_object is a buffer.
7265 If CODING->src_object is a buffer, it must be the current buffer.
7266 In this case, if CODING->src_pos is positive, it is a position of
7267 the source text in the buffer, otherwise, the source text is in the
7268 gap area of the buffer, and CODING->src_pos specifies the offset of
7269 the text from GPT (which must be the same as PT). If this is the
7270 same buffer as CODING->dst_object, CODING->src_pos must be
7271 negative.
7273 If CODING->src_object is a string, CODING->src_pos is an index to
7274 that string.
7276 If CODING->src_object is nil, CODING->source must already point to
7277 the non-relocatable memory area. In this case, CODING->src_pos is
7278 an offset from CODING->source.
7280 The decoded data is inserted at the current point of the buffer
7281 CODING->dst_object.
7284 static void
7285 decode_coding (struct coding_system *coding)
7287 Lisp_Object attrs;
7288 Lisp_Object undo_list;
7289 Lisp_Object translation_table;
7290 struct ccl_spec cclspec;
7291 int carryover;
7292 int i;
7294 USE_SAFE_ALLOCA;
7296 if (BUFFERP (coding->src_object)
7297 && coding->src_pos > 0
7298 && coding->src_pos < GPT
7299 && coding->src_pos + coding->src_chars > GPT)
7300 move_gap_both (coding->src_pos, coding->src_pos_byte);
7302 undo_list = Qt;
7303 if (BUFFERP (coding->dst_object))
7305 set_buffer_internal (XBUFFER (coding->dst_object));
7306 if (GPT != PT)
7307 move_gap_both (PT, PT_BYTE);
7309 /* We must disable undo_list in order to record the whole insert
7310 transaction via record_insert at the end. But doing so also
7311 disables the recording of the first change to the undo_list.
7312 Therefore we check for first change here and record it via
7313 record_first_change if needed. */
7314 if (MODIFF <= SAVE_MODIFF)
7315 record_first_change ();
7317 undo_list = BVAR (current_buffer, undo_list);
7318 bset_undo_list (current_buffer, Qt);
7321 coding->consumed = coding->consumed_char = 0;
7322 coding->produced = coding->produced_char = 0;
7323 coding->chars_at_source = 0;
7324 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7325 coding->errors = 0;
7327 ALLOC_CONVERSION_WORK_AREA (coding);
7329 attrs = CODING_ID_ATTRS (coding->id);
7330 translation_table = get_translation_table (attrs, 0, NULL);
7332 carryover = 0;
7333 if (coding->decoder == decode_coding_ccl)
7335 coding->spec.ccl = &cclspec;
7336 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7340 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7342 coding_set_source (coding);
7343 coding->annotated = 0;
7344 coding->charbuf_used = carryover;
7345 (*(coding->decoder)) (coding);
7346 coding_set_destination (coding);
7347 carryover = produce_chars (coding, translation_table, 0);
7348 if (coding->annotated)
7349 produce_annotation (coding, pos);
7350 for (i = 0; i < carryover; i++)
7351 coding->charbuf[i]
7352 = coding->charbuf[coding->charbuf_used - carryover + i];
7354 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7355 || (coding->consumed < coding->src_bytes
7356 && (coding->result == CODING_RESULT_SUCCESS
7357 || coding->result == CODING_RESULT_INVALID_SRC)));
7359 if (carryover > 0)
7361 coding_set_destination (coding);
7362 coding->charbuf_used = carryover;
7363 produce_chars (coding, translation_table, 1);
7366 coding->carryover_bytes = 0;
7367 if (coding->consumed < coding->src_bytes)
7369 int nbytes = coding->src_bytes - coding->consumed;
7370 const unsigned char *src;
7372 coding_set_source (coding);
7373 coding_set_destination (coding);
7374 src = coding->source + coding->consumed;
7376 if (coding->mode & CODING_MODE_LAST_BLOCK)
7378 /* Flush out unprocessed data as binary chars. We are sure
7379 that the number of data is less than the size of
7380 coding->charbuf. */
7381 coding->charbuf_used = 0;
7382 coding->chars_at_source = 0;
7384 while (nbytes-- > 0)
7386 int c = *src++;
7388 if (c & 0x80)
7389 c = BYTE8_TO_CHAR (c);
7390 coding->charbuf[coding->charbuf_used++] = c;
7392 produce_chars (coding, Qnil, 1);
7394 else
7396 /* Record unprocessed bytes in coding->carryover. We are
7397 sure that the number of data is less than the size of
7398 coding->carryover. */
7399 unsigned char *p = coding->carryover;
7401 if (nbytes > sizeof coding->carryover)
7402 nbytes = sizeof coding->carryover;
7403 coding->carryover_bytes = nbytes;
7404 while (nbytes-- > 0)
7405 *p++ = *src++;
7407 coding->consumed = coding->src_bytes;
7410 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7411 && !inhibit_eol_conversion)
7412 decode_eol (coding);
7413 if (BUFFERP (coding->dst_object))
7415 bset_undo_list (current_buffer, undo_list);
7416 record_insert (coding->dst_pos, coding->produced_char);
7419 SAFE_FREE ();
7423 /* Extract an annotation datum from a composition starting at POS and
7424 ending before LIMIT of CODING->src_object (buffer or string), store
7425 the data in BUF, set *STOP to a starting position of the next
7426 composition (if any) or to LIMIT, and return the address of the
7427 next element of BUF.
7429 If such an annotation is not found, set *STOP to a starting
7430 position of a composition after POS (if any) or to LIMIT, and
7431 return BUF. */
7433 static int *
7434 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7435 struct coding_system *coding, int *buf,
7436 ptrdiff_t *stop)
7438 ptrdiff_t start, end;
7439 Lisp_Object prop;
7441 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7442 || end > limit)
7443 *stop = limit;
7444 else if (start > pos)
7445 *stop = start;
7446 else
7448 if (start == pos)
7450 /* We found a composition. Store the corresponding
7451 annotation data in BUF. */
7452 int *head = buf;
7453 enum composition_method method = COMPOSITION_METHOD (prop);
7454 int nchars = COMPOSITION_LENGTH (prop);
7456 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7457 if (method != COMPOSITION_RELATIVE)
7459 Lisp_Object components;
7460 ptrdiff_t i, len, i_byte;
7462 components = COMPOSITION_COMPONENTS (prop);
7463 if (VECTORP (components))
7465 len = ASIZE (components);
7466 for (i = 0; i < len; i++)
7467 *buf++ = XINT (AREF (components, i));
7469 else if (STRINGP (components))
7471 len = SCHARS (components);
7472 i = i_byte = 0;
7473 while (i < len)
7475 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7476 buf++;
7479 else if (INTEGERP (components))
7481 len = 1;
7482 *buf++ = XINT (components);
7484 else if (CONSP (components))
7486 for (len = 0; CONSP (components);
7487 len++, components = XCDR (components))
7488 *buf++ = XINT (XCAR (components));
7490 else
7491 emacs_abort ();
7492 *head -= len;
7496 if (find_composition (end, limit, &start, &end, &prop,
7497 coding->src_object)
7498 && end <= limit)
7499 *stop = start;
7500 else
7501 *stop = limit;
7503 return buf;
7507 /* Extract an annotation datum from a text property `charset' at POS of
7508 CODING->src_object (buffer of string), store the data in BUF, set
7509 *STOP to the position where the value of `charset' property changes
7510 (limiting by LIMIT), and return the address of the next element of
7511 BUF.
7513 If the property value is nil, set *STOP to the position where the
7514 property value is non-nil (limiting by LIMIT), and return BUF. */
7516 static int *
7517 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7518 struct coding_system *coding, int *buf,
7519 ptrdiff_t *stop)
7521 Lisp_Object val, next;
7522 int id;
7524 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7525 if (! NILP (val) && CHARSETP (val))
7526 id = XINT (CHARSET_SYMBOL_ID (val));
7527 else
7528 id = -1;
7529 ADD_CHARSET_DATA (buf, 0, id);
7530 next = Fnext_single_property_change (make_number (pos), Qcharset,
7531 coding->src_object,
7532 make_number (limit));
7533 *stop = XINT (next);
7534 return buf;
7538 static void
7539 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7540 int max_lookup)
7542 int *buf = coding->charbuf;
7543 int *buf_end = coding->charbuf + coding->charbuf_size;
7544 const unsigned char *src = coding->source + coding->consumed;
7545 const unsigned char *src_end = coding->source + coding->src_bytes;
7546 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7547 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7548 bool multibytep = coding->src_multibyte;
7549 Lisp_Object eol_type;
7550 int c;
7551 ptrdiff_t stop, stop_composition, stop_charset;
7552 int *lookup_buf = NULL;
7554 if (! NILP (translation_table))
7555 lookup_buf = alloca (sizeof (int) * max_lookup);
7557 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7558 if (VECTORP (eol_type))
7559 eol_type = Qunix;
7561 /* Note: composition handling is not yet implemented. */
7562 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7564 if (NILP (coding->src_object))
7565 stop = stop_composition = stop_charset = end_pos;
7566 else
7568 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7569 stop = stop_composition = pos;
7570 else
7571 stop = stop_composition = end_pos;
7572 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7573 stop = stop_charset = pos;
7574 else
7575 stop_charset = end_pos;
7578 /* Compensate for CRLF and conversion. */
7579 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7580 while (buf < buf_end)
7582 Lisp_Object trans;
7584 if (pos == stop)
7586 if (pos == end_pos)
7587 break;
7588 if (pos == stop_composition)
7589 buf = handle_composition_annotation (pos, end_pos, coding,
7590 buf, &stop_composition);
7591 if (pos == stop_charset)
7592 buf = handle_charset_annotation (pos, end_pos, coding,
7593 buf, &stop_charset);
7594 stop = (stop_composition < stop_charset
7595 ? stop_composition : stop_charset);
7598 if (! multibytep)
7600 int bytes;
7602 if (coding->encoder == encode_coding_raw_text
7603 || coding->encoder == encode_coding_ccl)
7604 c = *src++, pos++;
7605 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7606 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7607 else
7608 c = BYTE8_TO_CHAR (*src), src++, pos++;
7610 else
7611 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7612 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7613 c = '\n';
7614 if (! EQ (eol_type, Qunix))
7616 if (c == '\n')
7618 if (EQ (eol_type, Qdos))
7619 *buf++ = '\r';
7620 else
7621 c = '\r';
7625 trans = Qnil;
7626 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7627 if (NILP (trans))
7628 *buf++ = c;
7629 else
7631 ptrdiff_t from_nchars = 1, to_nchars = 1;
7632 int *lookup_buf_end;
7633 const unsigned char *p = src;
7634 int i;
7636 lookup_buf[0] = c;
7637 for (i = 1; i < max_lookup && p < src_end; i++)
7638 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7639 lookup_buf_end = lookup_buf + i;
7640 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7641 if (INTEGERP (trans))
7642 c = XINT (trans);
7643 else if (CONSP (trans))
7645 from_nchars = ASIZE (XCAR (trans));
7646 trans = XCDR (trans);
7647 if (INTEGERP (trans))
7648 c = XINT (trans);
7649 else
7651 to_nchars = ASIZE (trans);
7652 if (buf_end - buf < to_nchars)
7653 break;
7654 c = XINT (AREF (trans, 0));
7657 else
7658 break;
7659 *buf++ = c;
7660 for (i = 1; i < to_nchars; i++)
7661 *buf++ = XINT (AREF (trans, i));
7662 for (i = 1; i < from_nchars; i++, pos++)
7663 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7667 coding->consumed = src - coding->source;
7668 coding->consumed_char = pos - coding->src_pos;
7669 coding->charbuf_used = buf - coding->charbuf;
7670 coding->chars_at_source = 0;
7674 /* Encode the text at CODING->src_object into CODING->dst_object.
7675 CODING->src_object is a buffer or a string.
7676 CODING->dst_object is a buffer or nil.
7678 If CODING->src_object is a buffer, it must be the current buffer.
7679 In this case, if CODING->src_pos is positive, it is a position of
7680 the source text in the buffer, otherwise. the source text is in the
7681 gap area of the buffer, and coding->src_pos specifies the offset of
7682 the text from GPT (which must be the same as PT). If this is the
7683 same buffer as CODING->dst_object, CODING->src_pos must be
7684 negative and CODING should not have `pre-write-conversion'.
7686 If CODING->src_object is a string, CODING should not have
7687 `pre-write-conversion'.
7689 If CODING->dst_object is a buffer, the encoded data is inserted at
7690 the current point of that buffer.
7692 If CODING->dst_object is nil, the encoded data is placed at the
7693 memory area specified by CODING->destination. */
7695 static void
7696 encode_coding (struct coding_system *coding)
7698 Lisp_Object attrs;
7699 Lisp_Object translation_table;
7700 int max_lookup;
7701 struct ccl_spec cclspec;
7703 USE_SAFE_ALLOCA;
7705 attrs = CODING_ID_ATTRS (coding->id);
7706 if (coding->encoder == encode_coding_raw_text)
7707 translation_table = Qnil, max_lookup = 0;
7708 else
7709 translation_table = get_translation_table (attrs, 1, &max_lookup);
7711 if (BUFFERP (coding->dst_object))
7713 set_buffer_internal (XBUFFER (coding->dst_object));
7714 coding->dst_multibyte
7715 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7718 coding->consumed = coding->consumed_char = 0;
7719 coding->produced = coding->produced_char = 0;
7720 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7721 coding->errors = 0;
7723 ALLOC_CONVERSION_WORK_AREA (coding);
7725 if (coding->encoder == encode_coding_ccl)
7727 coding->spec.ccl = &cclspec;
7728 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7730 do {
7731 coding_set_source (coding);
7732 consume_chars (coding, translation_table, max_lookup);
7733 coding_set_destination (coding);
7734 (*(coding->encoder)) (coding);
7735 } while (coding->consumed_char < coding->src_chars);
7737 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7738 insert_from_gap (coding->produced_char, coding->produced, 0);
7740 SAFE_FREE ();
7744 /* Name (or base name) of work buffer for code conversion. */
7745 static Lisp_Object Vcode_conversion_workbuf_name;
7747 /* A working buffer used by the top level conversion. Once it is
7748 created, it is never destroyed. It has the name
7749 Vcode_conversion_workbuf_name. The other working buffers are
7750 destroyed after the use is finished, and their names are modified
7751 versions of Vcode_conversion_workbuf_name. */
7752 static Lisp_Object Vcode_conversion_reused_workbuf;
7754 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7755 static bool reused_workbuf_in_use;
7758 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7759 multibyteness of returning buffer. */
7761 static Lisp_Object
7762 make_conversion_work_buffer (bool multibyte)
7764 Lisp_Object name, workbuf;
7765 struct buffer *current;
7767 if (reused_workbuf_in_use)
7769 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7770 workbuf = Fget_buffer_create (name);
7772 else
7774 reused_workbuf_in_use = 1;
7775 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7776 Vcode_conversion_reused_workbuf
7777 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7778 workbuf = Vcode_conversion_reused_workbuf;
7780 current = current_buffer;
7781 set_buffer_internal (XBUFFER (workbuf));
7782 /* We can't allow modification hooks to run in the work buffer. For
7783 instance, directory_files_internal assumes that file decoding
7784 doesn't compile new regexps. */
7785 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7786 Ferase_buffer ();
7787 bset_undo_list (current_buffer, Qt);
7788 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7789 set_buffer_internal (current);
7790 return workbuf;
7794 static void
7795 code_conversion_restore (Lisp_Object arg)
7797 Lisp_Object current, workbuf;
7798 struct gcpro gcpro1;
7800 GCPRO1 (arg);
7801 current = XCAR (arg);
7802 workbuf = XCDR (arg);
7803 if (! NILP (workbuf))
7805 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7806 reused_workbuf_in_use = 0;
7807 else
7808 Fkill_buffer (workbuf);
7810 set_buffer_internal (XBUFFER (current));
7811 UNGCPRO;
7814 Lisp_Object
7815 code_conversion_save (bool with_work_buf, bool multibyte)
7817 Lisp_Object workbuf = Qnil;
7819 if (with_work_buf)
7820 workbuf = make_conversion_work_buffer (multibyte);
7821 record_unwind_protect (code_conversion_restore,
7822 Fcons (Fcurrent_buffer (), workbuf));
7823 return workbuf;
7826 void
7827 decode_coding_gap (struct coding_system *coding,
7828 ptrdiff_t chars, ptrdiff_t bytes)
7830 ptrdiff_t count = SPECPDL_INDEX ();
7831 Lisp_Object attrs;
7833 coding->src_object = Fcurrent_buffer ();
7834 coding->src_chars = chars;
7835 coding->src_bytes = bytes;
7836 coding->src_pos = -chars;
7837 coding->src_pos_byte = -bytes;
7838 coding->src_multibyte = chars < bytes;
7839 coding->dst_object = coding->src_object;
7840 coding->dst_pos = PT;
7841 coding->dst_pos_byte = PT_BYTE;
7842 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7844 coding->head_ascii = -1;
7845 coding->detected_utf8_chars = -1;
7846 coding->eol_seen = EOL_SEEN_NONE;
7847 if (CODING_REQUIRE_DETECTION (coding))
7848 detect_coding (coding);
7849 attrs = CODING_ID_ATTRS (coding->id);
7850 if (! disable_ascii_optimization
7851 && ! coding->src_multibyte
7852 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7853 && NILP (CODING_ATTR_POST_READ (attrs))
7854 && NILP (get_translation_table (attrs, 0, NULL)))
7856 chars = coding->head_ascii;
7857 if (chars < 0)
7858 chars = check_ascii (coding);
7859 if (chars != bytes)
7861 /* There exists a non-ASCII byte. */
7862 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7864 if (coding->detected_utf8_chars >= 0)
7865 chars = coding->detected_utf8_chars;
7866 else
7867 chars = check_utf_8 (coding);
7868 if (CODING_UTF_8_BOM (coding) != utf_without_bom
7869 && coding->head_ascii == 0
7870 && coding->source[0] == UTF_8_BOM_1
7871 && coding->source[1] == UTF_8_BOM_2
7872 && coding->source[2] == UTF_8_BOM_3)
7874 chars--;
7875 bytes -= 3;
7876 coding->src_bytes -= 3;
7879 else
7880 chars = -1;
7882 if (chars >= 0)
7884 Lisp_Object eol_type;
7886 eol_type = CODING_ID_EOL_TYPE (coding->id);
7887 if (VECTORP (eol_type))
7889 if (coding->eol_seen != EOL_SEEN_NONE)
7890 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7892 if (EQ (eol_type, Qmac))
7894 unsigned char *src_end = GAP_END_ADDR;
7895 unsigned char *src = src_end - coding->src_bytes;
7897 while (src < src_end)
7899 if (*src++ == '\r')
7900 src[-1] = '\n';
7903 else if (EQ (eol_type, Qdos))
7905 unsigned char *src = GAP_END_ADDR;
7906 unsigned char *src_beg = src - coding->src_bytes;
7907 unsigned char *dst = src;
7908 ptrdiff_t diff;
7910 while (src_beg < src)
7912 *--dst = *--src;
7913 if (*src == '\n' && src > src_beg && src[-1] == '\r')
7914 src--;
7916 diff = dst - src;
7917 bytes -= diff;
7918 chars -= diff;
7920 coding->produced = bytes;
7921 coding->produced_char = chars;
7922 insert_from_gap (chars, bytes, 1);
7923 return;
7926 code_conversion_save (0, 0);
7928 coding->mode |= CODING_MODE_LAST_BLOCK;
7929 current_buffer->text->inhibit_shrinking = 1;
7930 decode_coding (coding);
7931 current_buffer->text->inhibit_shrinking = 0;
7933 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7935 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7936 Lisp_Object val;
7938 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7939 val = call1 (CODING_ATTR_POST_READ (attrs),
7940 make_number (coding->produced_char));
7941 CHECK_NATNUM (val);
7942 coding->produced_char += Z - prev_Z;
7943 coding->produced += Z_BYTE - prev_Z_BYTE;
7946 unbind_to (count, Qnil);
7950 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7951 SRC_OBJECT into DST_OBJECT by coding context CODING.
7953 SRC_OBJECT is a buffer, a string, or Qnil.
7955 If it is a buffer, the text is at point of the buffer. FROM and TO
7956 are positions in the buffer.
7958 If it is a string, the text is at the beginning of the string.
7959 FROM and TO are indices to the string.
7961 If it is nil, the text is at coding->source. FROM and TO are
7962 indices to coding->source.
7964 DST_OBJECT is a buffer, Qt, or Qnil.
7966 If it is a buffer, the decoded text is inserted at point of the
7967 buffer. If the buffer is the same as SRC_OBJECT, the source text
7968 is deleted.
7970 If it is Qt, a string is made from the decoded text, and
7971 set in CODING->dst_object.
7973 If it is Qnil, the decoded text is stored at CODING->destination.
7974 The caller must allocate CODING->dst_bytes bytes at
7975 CODING->destination by xmalloc. If the decoded text is longer than
7976 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7979 void
7980 decode_coding_object (struct coding_system *coding,
7981 Lisp_Object src_object,
7982 ptrdiff_t from, ptrdiff_t from_byte,
7983 ptrdiff_t to, ptrdiff_t to_byte,
7984 Lisp_Object dst_object)
7986 ptrdiff_t count = SPECPDL_INDEX ();
7987 unsigned char *destination IF_LINT (= NULL);
7988 ptrdiff_t dst_bytes IF_LINT (= 0);
7989 ptrdiff_t chars = to - from;
7990 ptrdiff_t bytes = to_byte - from_byte;
7991 Lisp_Object attrs;
7992 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7993 bool need_marker_adjustment = 0;
7994 Lisp_Object old_deactivate_mark;
7996 old_deactivate_mark = Vdeactivate_mark;
7998 if (NILP (dst_object))
8000 destination = coding->destination;
8001 dst_bytes = coding->dst_bytes;
8004 coding->src_object = src_object;
8005 coding->src_chars = chars;
8006 coding->src_bytes = bytes;
8007 coding->src_multibyte = chars < bytes;
8009 if (STRINGP (src_object))
8011 coding->src_pos = from;
8012 coding->src_pos_byte = from_byte;
8014 else if (BUFFERP (src_object))
8016 set_buffer_internal (XBUFFER (src_object));
8017 if (from != GPT)
8018 move_gap_both (from, from_byte);
8019 if (EQ (src_object, dst_object))
8021 struct Lisp_Marker *tail;
8023 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8025 tail->need_adjustment
8026 = tail->charpos == (tail->insertion_type ? from : to);
8027 need_marker_adjustment |= tail->need_adjustment;
8029 saved_pt = PT, saved_pt_byte = PT_BYTE;
8030 TEMP_SET_PT_BOTH (from, from_byte);
8031 current_buffer->text->inhibit_shrinking = 1;
8032 del_range_both (from, from_byte, to, to_byte, 1);
8033 coding->src_pos = -chars;
8034 coding->src_pos_byte = -bytes;
8036 else
8038 coding->src_pos = from;
8039 coding->src_pos_byte = from_byte;
8043 if (CODING_REQUIRE_DETECTION (coding))
8044 detect_coding (coding);
8045 attrs = CODING_ID_ATTRS (coding->id);
8047 if (EQ (dst_object, Qt)
8048 || (! NILP (CODING_ATTR_POST_READ (attrs))
8049 && NILP (dst_object)))
8051 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8052 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8053 coding->dst_pos = BEG;
8054 coding->dst_pos_byte = BEG_BYTE;
8056 else if (BUFFERP (dst_object))
8058 code_conversion_save (0, 0);
8059 coding->dst_object = dst_object;
8060 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8061 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8062 coding->dst_multibyte
8063 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8065 else
8067 code_conversion_save (0, 0);
8068 coding->dst_object = Qnil;
8069 /* Most callers presume this will return a multibyte result, and they
8070 won't use `binary' or `raw-text' anyway, so let's not worry about
8071 CODING_FOR_UNIBYTE. */
8072 coding->dst_multibyte = 1;
8075 decode_coding (coding);
8077 if (BUFFERP (coding->dst_object))
8078 set_buffer_internal (XBUFFER (coding->dst_object));
8080 if (! NILP (CODING_ATTR_POST_READ (attrs)))
8082 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8083 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8084 Lisp_Object val;
8086 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8087 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8088 old_deactivate_mark);
8089 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8090 make_number (coding->produced_char));
8091 UNGCPRO;
8092 CHECK_NATNUM (val);
8093 coding->produced_char += Z - prev_Z;
8094 coding->produced += Z_BYTE - prev_Z_BYTE;
8097 if (EQ (dst_object, Qt))
8099 coding->dst_object = Fbuffer_string ();
8101 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8103 set_buffer_internal (XBUFFER (coding->dst_object));
8104 if (dst_bytes < coding->produced)
8106 eassert (coding->produced > 0);
8107 destination = xrealloc (destination, coding->produced);
8108 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8109 move_gap_both (BEGV, BEGV_BYTE);
8110 memcpy (destination, BEGV_ADDR, coding->produced);
8111 coding->destination = destination;
8115 if (saved_pt >= 0)
8117 /* This is the case of:
8118 (BUFFERP (src_object) && EQ (src_object, dst_object))
8119 As we have moved PT while replacing the original buffer
8120 contents, we must recover it now. */
8121 set_buffer_internal (XBUFFER (src_object));
8122 current_buffer->text->inhibit_shrinking = 0;
8123 if (saved_pt < from)
8124 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8125 else if (saved_pt < from + chars)
8126 TEMP_SET_PT_BOTH (from, from_byte);
8127 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8128 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8129 saved_pt_byte + (coding->produced - bytes));
8130 else
8131 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8132 saved_pt_byte + (coding->produced - bytes));
8134 if (need_marker_adjustment)
8136 struct Lisp_Marker *tail;
8138 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8139 if (tail->need_adjustment)
8141 tail->need_adjustment = 0;
8142 if (tail->insertion_type)
8144 tail->bytepos = from_byte;
8145 tail->charpos = from;
8147 else
8149 tail->bytepos = from_byte + coding->produced;
8150 tail->charpos
8151 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8152 ? tail->bytepos : from + coding->produced_char);
8158 Vdeactivate_mark = old_deactivate_mark;
8159 unbind_to (count, coding->dst_object);
8163 void
8164 encode_coding_object (struct coding_system *coding,
8165 Lisp_Object src_object,
8166 ptrdiff_t from, ptrdiff_t from_byte,
8167 ptrdiff_t to, ptrdiff_t to_byte,
8168 Lisp_Object dst_object)
8170 ptrdiff_t count = SPECPDL_INDEX ();
8171 ptrdiff_t chars = to - from;
8172 ptrdiff_t bytes = to_byte - from_byte;
8173 Lisp_Object attrs;
8174 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8175 bool need_marker_adjustment = 0;
8176 bool kill_src_buffer = 0;
8177 Lisp_Object old_deactivate_mark;
8179 old_deactivate_mark = Vdeactivate_mark;
8181 coding->src_object = src_object;
8182 coding->src_chars = chars;
8183 coding->src_bytes = bytes;
8184 coding->src_multibyte = chars < bytes;
8186 attrs = CODING_ID_ATTRS (coding->id);
8188 if (EQ (src_object, dst_object))
8190 struct Lisp_Marker *tail;
8192 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8194 tail->need_adjustment
8195 = tail->charpos == (tail->insertion_type ? from : to);
8196 need_marker_adjustment |= tail->need_adjustment;
8200 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8202 coding->src_object = code_conversion_save (1, coding->src_multibyte);
8203 set_buffer_internal (XBUFFER (coding->src_object));
8204 if (STRINGP (src_object))
8205 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8206 else if (BUFFERP (src_object))
8207 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8208 else
8209 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8211 if (EQ (src_object, dst_object))
8213 set_buffer_internal (XBUFFER (src_object));
8214 saved_pt = PT, saved_pt_byte = PT_BYTE;
8215 del_range_both (from, from_byte, to, to_byte, 1);
8216 set_buffer_internal (XBUFFER (coding->src_object));
8220 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8222 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8223 old_deactivate_mark);
8224 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8225 make_number (BEG), make_number (Z));
8226 UNGCPRO;
8228 if (XBUFFER (coding->src_object) != current_buffer)
8229 kill_src_buffer = 1;
8230 coding->src_object = Fcurrent_buffer ();
8231 if (BEG != GPT)
8232 move_gap_both (BEG, BEG_BYTE);
8233 coding->src_chars = Z - BEG;
8234 coding->src_bytes = Z_BYTE - BEG_BYTE;
8235 coding->src_pos = BEG;
8236 coding->src_pos_byte = BEG_BYTE;
8237 coding->src_multibyte = Z < Z_BYTE;
8239 else if (STRINGP (src_object))
8241 code_conversion_save (0, 0);
8242 coding->src_pos = from;
8243 coding->src_pos_byte = from_byte;
8245 else if (BUFFERP (src_object))
8247 code_conversion_save (0, 0);
8248 set_buffer_internal (XBUFFER (src_object));
8249 if (EQ (src_object, dst_object))
8251 saved_pt = PT, saved_pt_byte = PT_BYTE;
8252 coding->src_object = del_range_1 (from, to, 1, 1);
8253 coding->src_pos = 0;
8254 coding->src_pos_byte = 0;
8256 else
8258 if (from < GPT && to >= GPT)
8259 move_gap_both (from, from_byte);
8260 coding->src_pos = from;
8261 coding->src_pos_byte = from_byte;
8264 else
8265 code_conversion_save (0, 0);
8267 if (BUFFERP (dst_object))
8269 coding->dst_object = dst_object;
8270 if (EQ (src_object, dst_object))
8272 coding->dst_pos = from;
8273 coding->dst_pos_byte = from_byte;
8275 else
8277 struct buffer *current = current_buffer;
8279 set_buffer_temp (XBUFFER (dst_object));
8280 coding->dst_pos = PT;
8281 coding->dst_pos_byte = PT_BYTE;
8282 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8283 set_buffer_temp (current);
8285 coding->dst_multibyte
8286 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8288 else if (EQ (dst_object, Qt))
8290 ptrdiff_t dst_bytes = max (1, coding->src_chars);
8291 coding->dst_object = Qnil;
8292 coding->destination = xmalloc (dst_bytes);
8293 coding->dst_bytes = dst_bytes;
8294 coding->dst_multibyte = 0;
8296 else
8298 coding->dst_object = Qnil;
8299 coding->dst_multibyte = 0;
8302 encode_coding (coding);
8304 if (EQ (dst_object, Qt))
8306 if (BUFFERP (coding->dst_object))
8307 coding->dst_object = Fbuffer_string ();
8308 else
8310 coding->dst_object
8311 = make_unibyte_string ((char *) coding->destination,
8312 coding->produced);
8313 xfree (coding->destination);
8317 if (saved_pt >= 0)
8319 /* This is the case of:
8320 (BUFFERP (src_object) && EQ (src_object, dst_object))
8321 As we have moved PT while replacing the original buffer
8322 contents, we must recover it now. */
8323 set_buffer_internal (XBUFFER (src_object));
8324 if (saved_pt < from)
8325 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8326 else if (saved_pt < from + chars)
8327 TEMP_SET_PT_BOTH (from, from_byte);
8328 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8329 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8330 saved_pt_byte + (coding->produced - bytes));
8331 else
8332 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8333 saved_pt_byte + (coding->produced - bytes));
8335 if (need_marker_adjustment)
8337 struct Lisp_Marker *tail;
8339 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8340 if (tail->need_adjustment)
8342 tail->need_adjustment = 0;
8343 if (tail->insertion_type)
8345 tail->bytepos = from_byte;
8346 tail->charpos = from;
8348 else
8350 tail->bytepos = from_byte + coding->produced;
8351 tail->charpos
8352 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8353 ? tail->bytepos : from + coding->produced_char);
8359 if (kill_src_buffer)
8360 Fkill_buffer (coding->src_object);
8362 Vdeactivate_mark = old_deactivate_mark;
8363 unbind_to (count, Qnil);
8367 Lisp_Object
8368 preferred_coding_system (void)
8370 int id = coding_categories[coding_priorities[0]].id;
8372 return CODING_ID_NAME (id);
8375 #if defined (WINDOWSNT) || defined (CYGWIN)
8377 Lisp_Object
8378 from_unicode (Lisp_Object str)
8380 CHECK_STRING (str);
8381 if (!STRING_MULTIBYTE (str) &&
8382 SBYTES (str) & 1)
8384 str = Fsubstring (str, make_number (0), make_number (-1));
8387 return code_convert_string_norecord (str, Qutf_16le, 0);
8390 Lisp_Object
8391 from_unicode_buffer (const wchar_t* wstr)
8393 return from_unicode (
8394 make_unibyte_string (
8395 (char*) wstr,
8396 /* we get one of the two final 0 bytes for free. */
8397 1 + sizeof (wchar_t) * wcslen (wstr)));
8400 wchar_t *
8401 to_unicode (Lisp_Object str, Lisp_Object *buf)
8403 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8404 /* We need to make another copy (in addition to the one made by
8405 code_convert_string_norecord) to ensure that the final string is
8406 _doubly_ zero terminated --- that is, that the string is
8407 terminated by two zero bytes and one utf-16le null character.
8408 Because strings are already terminated with a single zero byte,
8409 we just add one additional zero. */
8410 str = make_uninit_string (SBYTES (*buf) + 1);
8411 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8412 SDATA (str) [SBYTES (*buf)] = '\0';
8413 *buf = str;
8414 return WCSDATA (*buf);
8417 #endif /* WINDOWSNT || CYGWIN */
8420 #ifdef emacs
8421 /*** 8. Emacs Lisp library functions ***/
8423 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8424 doc: /* Return t if OBJECT is nil or a coding-system.
8425 See the documentation of `define-coding-system' for information
8426 about coding-system objects. */)
8427 (Lisp_Object object)
8429 if (NILP (object)
8430 || CODING_SYSTEM_ID (object) >= 0)
8431 return Qt;
8432 if (! SYMBOLP (object)
8433 || NILP (Fget (object, Qcoding_system_define_form)))
8434 return Qnil;
8435 return Qt;
8438 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8439 Sread_non_nil_coding_system, 1, 1, 0,
8440 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8441 (Lisp_Object prompt)
8443 Lisp_Object val;
8446 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8447 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8449 while (SCHARS (val) == 0);
8450 return (Fintern (val, Qnil));
8453 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8454 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8455 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8456 Ignores case when completing coding systems (all Emacs coding systems
8457 are lower-case). */)
8458 (Lisp_Object prompt, Lisp_Object default_coding_system)
8460 Lisp_Object val;
8461 ptrdiff_t count = SPECPDL_INDEX ();
8463 if (SYMBOLP (default_coding_system))
8464 default_coding_system = SYMBOL_NAME (default_coding_system);
8465 specbind (Qcompletion_ignore_case, Qt);
8466 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8467 Qt, Qnil, Qcoding_system_history,
8468 default_coding_system, Qnil);
8469 unbind_to (count, Qnil);
8470 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8473 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8474 1, 1, 0,
8475 doc: /* Check validity of CODING-SYSTEM.
8476 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8477 It is valid if it is nil or a symbol defined as a coding system by the
8478 function `define-coding-system'. */)
8479 (Lisp_Object coding_system)
8481 Lisp_Object define_form;
8483 define_form = Fget (coding_system, Qcoding_system_define_form);
8484 if (! NILP (define_form))
8486 Fput (coding_system, Qcoding_system_define_form, Qnil);
8487 safe_eval (define_form);
8489 if (!NILP (Fcoding_system_p (coding_system)))
8490 return coding_system;
8491 xsignal1 (Qcoding_system_error, coding_system);
8495 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8496 HIGHEST, return the coding system of the highest
8497 priority among the detected coding systems. Otherwise return a
8498 list of detected coding systems sorted by their priorities. If
8499 MULTIBYTEP, it is assumed that the bytes are in correct
8500 multibyte form but contains only ASCII and eight-bit chars.
8501 Otherwise, the bytes are raw bytes.
8503 CODING-SYSTEM controls the detection as below:
8505 If it is nil, detect both text-format and eol-format. If the
8506 text-format part of CODING-SYSTEM is already specified
8507 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8508 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8509 detect only text-format. */
8511 Lisp_Object
8512 detect_coding_system (const unsigned char *src,
8513 ptrdiff_t src_chars, ptrdiff_t src_bytes,
8514 bool highest, bool multibytep,
8515 Lisp_Object coding_system)
8517 const unsigned char *src_end = src + src_bytes;
8518 Lisp_Object attrs, eol_type;
8519 Lisp_Object val = Qnil;
8520 struct coding_system coding;
8521 ptrdiff_t id;
8522 struct coding_detection_info detect_info;
8523 enum coding_category base_category;
8524 bool null_byte_found = 0, eight_bit_found = 0;
8526 if (NILP (coding_system))
8527 coding_system = Qundecided;
8528 setup_coding_system (coding_system, &coding);
8529 attrs = CODING_ID_ATTRS (coding.id);
8530 eol_type = CODING_ID_EOL_TYPE (coding.id);
8531 coding_system = CODING_ATTR_BASE_NAME (attrs);
8533 coding.source = src;
8534 coding.src_chars = src_chars;
8535 coding.src_bytes = src_bytes;
8536 coding.src_multibyte = multibytep;
8537 coding.consumed = 0;
8538 coding.mode |= CODING_MODE_LAST_BLOCK;
8539 coding.head_ascii = 0;
8541 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8543 /* At first, detect text-format if necessary. */
8544 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8545 if (base_category == coding_category_undecided)
8547 enum coding_category category IF_LINT (= 0);
8548 struct coding_system *this IF_LINT (= NULL);
8549 int c, i;
8550 bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8551 inhibit_null_byte_detection);
8552 bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8553 inhibit_iso_escape_detection);
8554 bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8556 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8557 for (; src < src_end; src++)
8559 c = *src;
8560 if (c & 0x80)
8562 eight_bit_found = 1;
8563 if (null_byte_found)
8564 break;
8566 else if (c < 0x20)
8568 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8569 && ! inhibit_ied
8570 && ! detect_info.checked)
8572 if (detect_coding_iso_2022 (&coding, &detect_info))
8574 /* We have scanned the whole data. */
8575 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8577 /* We didn't find an 8-bit code. We may
8578 have found a null-byte, but it's very
8579 rare that a binary file confirm to
8580 ISO-2022. */
8581 src = src_end;
8582 coding.head_ascii = src - coding.source;
8584 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8585 break;
8588 else if (! c && !inhibit_nbd)
8590 null_byte_found = 1;
8591 if (eight_bit_found)
8592 break;
8594 if (! eight_bit_found)
8595 coding.head_ascii++;
8597 else if (! eight_bit_found)
8598 coding.head_ascii++;
8601 if (null_byte_found || eight_bit_found
8602 || coding.head_ascii < coding.src_bytes
8603 || detect_info.found)
8605 if (coding.head_ascii == coding.src_bytes)
8606 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8607 for (i = 0; i < coding_category_raw_text; i++)
8609 category = coding_priorities[i];
8610 this = coding_categories + category;
8611 if (detect_info.found & (1 << category))
8612 break;
8614 else
8616 if (null_byte_found)
8618 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8619 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8621 else if (prefer_utf_8
8622 && detect_coding_utf_8 (&coding, &detect_info))
8624 detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8625 detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8627 for (i = 0; i < coding_category_raw_text; i++)
8629 category = coding_priorities[i];
8630 this = coding_categories + category;
8632 if (this->id < 0)
8634 /* No coding system of this category is defined. */
8635 detect_info.rejected |= (1 << category);
8637 else if (category >= coding_category_raw_text)
8638 continue;
8639 else if (detect_info.checked & (1 << category))
8641 if (highest
8642 && (detect_info.found & (1 << category)))
8643 break;
8645 else if ((*(this->detector)) (&coding, &detect_info)
8646 && highest
8647 && (detect_info.found & (1 << category)))
8649 if (category == coding_category_utf_16_auto)
8651 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8652 category = coding_category_utf_16_le;
8653 else
8654 category = coding_category_utf_16_be;
8656 break;
8662 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8663 || null_byte_found)
8665 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8666 id = CODING_SYSTEM_ID (Qno_conversion);
8667 val = list1 (make_number (id));
8669 else if (! detect_info.rejected && ! detect_info.found)
8671 detect_info.found = CATEGORY_MASK_ANY;
8672 id = coding_categories[coding_category_undecided].id;
8673 val = list1 (make_number (id));
8675 else if (highest)
8677 if (detect_info.found)
8679 detect_info.found = 1 << category;
8680 val = list1 (make_number (this->id));
8682 else
8683 for (i = 0; i < coding_category_raw_text; i++)
8684 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8686 detect_info.found = 1 << coding_priorities[i];
8687 id = coding_categories[coding_priorities[i]].id;
8688 val = list1 (make_number (id));
8689 break;
8692 else
8694 int mask = detect_info.rejected | detect_info.found;
8695 int found = 0;
8697 for (i = coding_category_raw_text - 1; i >= 0; i--)
8699 category = coding_priorities[i];
8700 if (! (mask & (1 << category)))
8702 found |= 1 << category;
8703 id = coding_categories[category].id;
8704 if (id >= 0)
8705 val = list1 (make_number (id));
8708 for (i = coding_category_raw_text - 1; i >= 0; i--)
8710 category = coding_priorities[i];
8711 if (detect_info.found & (1 << category))
8713 id = coding_categories[category].id;
8714 val = Fcons (make_number (id), val);
8717 detect_info.found |= found;
8720 else if (base_category == coding_category_utf_8_auto)
8722 if (detect_coding_utf_8 (&coding, &detect_info))
8724 struct coding_system *this;
8726 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8727 this = coding_categories + coding_category_utf_8_sig;
8728 else
8729 this = coding_categories + coding_category_utf_8_nosig;
8730 val = list1 (make_number (this->id));
8733 else if (base_category == coding_category_utf_16_auto)
8735 if (detect_coding_utf_16 (&coding, &detect_info))
8737 struct coding_system *this;
8739 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8740 this = coding_categories + coding_category_utf_16_le;
8741 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8742 this = coding_categories + coding_category_utf_16_be;
8743 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8744 this = coding_categories + coding_category_utf_16_be_nosig;
8745 else
8746 this = coding_categories + coding_category_utf_16_le_nosig;
8747 val = list1 (make_number (this->id));
8750 else
8752 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8753 val = list1 (make_number (coding.id));
8756 /* Then, detect eol-format if necessary. */
8758 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8759 Lisp_Object tail;
8761 if (VECTORP (eol_type))
8763 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8765 if (null_byte_found)
8766 normal_eol = EOL_SEEN_LF;
8767 else
8768 normal_eol = detect_eol (coding.source, src_bytes,
8769 coding_category_raw_text);
8771 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8772 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8773 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8774 coding_category_utf_16_be);
8775 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8776 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8777 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8778 coding_category_utf_16_le);
8780 else
8782 if (EQ (eol_type, Qunix))
8783 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8784 else if (EQ (eol_type, Qdos))
8785 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8786 else
8787 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8790 for (tail = val; CONSP (tail); tail = XCDR (tail))
8792 enum coding_category category;
8793 int this_eol;
8795 id = XINT (XCAR (tail));
8796 attrs = CODING_ID_ATTRS (id);
8797 category = XINT (CODING_ATTR_CATEGORY (attrs));
8798 eol_type = CODING_ID_EOL_TYPE (id);
8799 if (VECTORP (eol_type))
8801 if (category == coding_category_utf_16_be
8802 || category == coding_category_utf_16_be_nosig)
8803 this_eol = utf_16_be_eol;
8804 else if (category == coding_category_utf_16_le
8805 || category == coding_category_utf_16_le_nosig)
8806 this_eol = utf_16_le_eol;
8807 else
8808 this_eol = normal_eol;
8810 if (this_eol == EOL_SEEN_LF)
8811 XSETCAR (tail, AREF (eol_type, 0));
8812 else if (this_eol == EOL_SEEN_CRLF)
8813 XSETCAR (tail, AREF (eol_type, 1));
8814 else if (this_eol == EOL_SEEN_CR)
8815 XSETCAR (tail, AREF (eol_type, 2));
8816 else
8817 XSETCAR (tail, CODING_ID_NAME (id));
8819 else
8820 XSETCAR (tail, CODING_ID_NAME (id));
8824 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8828 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8829 2, 3, 0,
8830 doc: /* Detect coding system of the text in the region between START and END.
8831 Return a list of possible coding systems ordered by priority.
8832 The coding systems to try and their priorities follows what
8833 the function `coding-system-priority-list' (which see) returns.
8835 If only ASCII characters are found (except for such ISO-2022 control
8836 characters as ESC), it returns a list of single element `undecided'
8837 or its subsidiary coding system according to a detected end-of-line
8838 format.
8840 If optional argument HIGHEST is non-nil, return the coding system of
8841 highest priority. */)
8842 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8844 ptrdiff_t from, to;
8845 ptrdiff_t from_byte, to_byte;
8847 validate_region (&start, &end);
8848 from = XINT (start), to = XINT (end);
8849 from_byte = CHAR_TO_BYTE (from);
8850 to_byte = CHAR_TO_BYTE (to);
8852 if (from < GPT && to >= GPT)
8853 move_gap_both (to, to_byte);
8855 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8856 to - from, to_byte - from_byte,
8857 !NILP (highest),
8858 !NILP (BVAR (current_buffer
8859 , enable_multibyte_characters)),
8860 Qnil);
8863 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8864 1, 2, 0,
8865 doc: /* Detect coding system of the text in STRING.
8866 Return a list of possible coding systems ordered by priority.
8867 The coding systems to try and their priorities follows what
8868 the function `coding-system-priority-list' (which see) returns.
8870 If only ASCII characters are found (except for such ISO-2022 control
8871 characters as ESC), it returns a list of single element `undecided'
8872 or its subsidiary coding system according to a detected end-of-line
8873 format.
8875 If optional argument HIGHEST is non-nil, return the coding system of
8876 highest priority. */)
8877 (Lisp_Object string, Lisp_Object highest)
8879 CHECK_STRING (string);
8881 return detect_coding_system (SDATA (string),
8882 SCHARS (string), SBYTES (string),
8883 !NILP (highest), STRING_MULTIBYTE (string),
8884 Qnil);
8888 static bool
8889 char_encodable_p (int c, Lisp_Object attrs)
8891 Lisp_Object tail;
8892 struct charset *charset;
8893 Lisp_Object translation_table;
8895 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8896 if (! NILP (translation_table))
8897 c = translate_char (translation_table, c);
8898 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8899 CONSP (tail); tail = XCDR (tail))
8901 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8902 if (CHAR_CHARSET_P (c, charset))
8903 break;
8905 return (! NILP (tail));
8909 /* Return a list of coding systems that safely encode the text between
8910 START and END. If EXCLUDE is non-nil, it is a list of coding
8911 systems not to check. The returned list doesn't contain any such
8912 coding systems. In any case, if the text contains only ASCII or is
8913 unibyte, return t. */
8915 DEFUN ("find-coding-systems-region-internal",
8916 Ffind_coding_systems_region_internal,
8917 Sfind_coding_systems_region_internal, 2, 3, 0,
8918 doc: /* Internal use only. */)
8919 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8921 Lisp_Object coding_attrs_list, safe_codings;
8922 ptrdiff_t start_byte, end_byte;
8923 const unsigned char *p, *pbeg, *pend;
8924 int c;
8925 Lisp_Object tail, elt, work_table;
8927 if (STRINGP (start))
8929 if (!STRING_MULTIBYTE (start)
8930 || SCHARS (start) == SBYTES (start))
8931 return Qt;
8932 start_byte = 0;
8933 end_byte = SBYTES (start);
8935 else
8937 CHECK_NUMBER_COERCE_MARKER (start);
8938 CHECK_NUMBER_COERCE_MARKER (end);
8939 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8940 args_out_of_range (start, end);
8941 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8942 return Qt;
8943 start_byte = CHAR_TO_BYTE (XINT (start));
8944 end_byte = CHAR_TO_BYTE (XINT (end));
8945 if (XINT (end) - XINT (start) == end_byte - start_byte)
8946 return Qt;
8948 if (XINT (start) < GPT && XINT (end) > GPT)
8950 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8951 move_gap_both (XINT (start), start_byte);
8952 else
8953 move_gap_both (XINT (end), end_byte);
8957 coding_attrs_list = Qnil;
8958 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8959 if (NILP (exclude)
8960 || NILP (Fmemq (XCAR (tail), exclude)))
8962 Lisp_Object attrs;
8964 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8965 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
8967 ASET (attrs, coding_attr_trans_tbl,
8968 get_translation_table (attrs, 1, NULL));
8969 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8973 if (STRINGP (start))
8974 p = pbeg = SDATA (start);
8975 else
8976 p = pbeg = BYTE_POS_ADDR (start_byte);
8977 pend = p + (end_byte - start_byte);
8979 while (p < pend && ASCII_BYTE_P (*p)) p++;
8980 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8982 work_table = Fmake_char_table (Qnil, Qnil);
8983 while (p < pend)
8985 if (ASCII_BYTE_P (*p))
8986 p++;
8987 else
8989 c = STRING_CHAR_ADVANCE (p);
8990 if (!NILP (char_table_ref (work_table, c)))
8991 /* This character was already checked. Ignore it. */
8992 continue;
8994 charset_map_loaded = 0;
8995 for (tail = coding_attrs_list; CONSP (tail);)
8997 elt = XCAR (tail);
8998 if (NILP (elt))
8999 tail = XCDR (tail);
9000 else if (char_encodable_p (c, elt))
9001 tail = XCDR (tail);
9002 else if (CONSP (XCDR (tail)))
9004 XSETCAR (tail, XCAR (XCDR (tail)));
9005 XSETCDR (tail, XCDR (XCDR (tail)));
9007 else
9009 XSETCAR (tail, Qnil);
9010 tail = XCDR (tail);
9013 if (charset_map_loaded)
9015 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9017 if (STRINGP (start))
9018 pbeg = SDATA (start);
9019 else
9020 pbeg = BYTE_POS_ADDR (start_byte);
9021 p = pbeg + p_offset;
9022 pend = pbeg + pend_offset;
9024 char_table_set (work_table, c, Qt);
9028 safe_codings = list2 (Qraw_text, Qno_conversion);
9029 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9030 if (! NILP (XCAR (tail)))
9031 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9033 return safe_codings;
9037 DEFUN ("unencodable-char-position", Funencodable_char_position,
9038 Sunencodable_char_position, 3, 5, 0,
9039 doc: /*
9040 Return position of first un-encodable character in a region.
9041 START and END specify the region and CODING-SYSTEM specifies the
9042 encoding to check. Return nil if CODING-SYSTEM does encode the region.
9044 If optional 4th argument COUNT is non-nil, it specifies at most how
9045 many un-encodable characters to search. In this case, the value is a
9046 list of positions.
9048 If optional 5th argument STRING is non-nil, it is a string to search
9049 for un-encodable characters. In that case, START and END are indexes
9050 to the string. */)
9051 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9053 EMACS_INT n;
9054 struct coding_system coding;
9055 Lisp_Object attrs, charset_list, translation_table;
9056 Lisp_Object positions;
9057 ptrdiff_t from, to;
9058 const unsigned char *p, *stop, *pend;
9059 bool ascii_compatible;
9061 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9062 attrs = CODING_ID_ATTRS (coding.id);
9063 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9064 return Qnil;
9065 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9066 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9067 translation_table = get_translation_table (attrs, 1, NULL);
9069 if (NILP (string))
9071 validate_region (&start, &end);
9072 from = XINT (start);
9073 to = XINT (end);
9074 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9075 || (ascii_compatible
9076 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9077 return Qnil;
9078 p = CHAR_POS_ADDR (from);
9079 pend = CHAR_POS_ADDR (to);
9080 if (from < GPT && to >= GPT)
9081 stop = GPT_ADDR;
9082 else
9083 stop = pend;
9085 else
9087 CHECK_STRING (string);
9088 CHECK_NATNUM (start);
9089 CHECK_NATNUM (end);
9090 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9091 args_out_of_range_3 (string, start, end);
9092 from = XINT (start);
9093 to = XINT (end);
9094 if (! STRING_MULTIBYTE (string))
9095 return Qnil;
9096 p = SDATA (string) + string_char_to_byte (string, from);
9097 stop = pend = SDATA (string) + string_char_to_byte (string, to);
9098 if (ascii_compatible && (to - from) == (pend - p))
9099 return Qnil;
9102 if (NILP (count))
9103 n = 1;
9104 else
9106 CHECK_NATNUM (count);
9107 n = XINT (count);
9110 positions = Qnil;
9111 charset_map_loaded = 0;
9112 while (1)
9114 int c;
9116 if (ascii_compatible)
9117 while (p < stop && ASCII_BYTE_P (*p))
9118 p++, from++;
9119 if (p >= stop)
9121 if (p >= pend)
9122 break;
9123 stop = pend;
9124 p = GAP_END_ADDR;
9127 c = STRING_CHAR_ADVANCE (p);
9128 if (! (ASCII_CHAR_P (c) && ascii_compatible)
9129 && ! char_charset (translate_char (translation_table, c),
9130 charset_list, NULL))
9132 positions = Fcons (make_number (from), positions);
9133 n--;
9134 if (n == 0)
9135 break;
9138 from++;
9139 if (charset_map_loaded && NILP (string))
9141 p = CHAR_POS_ADDR (from);
9142 pend = CHAR_POS_ADDR (to);
9143 if (from < GPT && to >= GPT)
9144 stop = GPT_ADDR;
9145 else
9146 stop = pend;
9147 charset_map_loaded = 0;
9151 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9155 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9156 Scheck_coding_systems_region, 3, 3, 0,
9157 doc: /* Check if the region is encodable by coding systems.
9159 START and END are buffer positions specifying the region.
9160 CODING-SYSTEM-LIST is a list of coding systems to check.
9162 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9163 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9164 whole region, POS0, POS1, ... are buffer positions where non-encodable
9165 characters are found.
9167 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9168 value is nil.
9170 START may be a string. In that case, check if the string is
9171 encodable, and the value contains indices to the string instead of
9172 buffer positions. END is ignored.
9174 If the current buffer (or START if it is a string) is unibyte, the value
9175 is nil. */)
9176 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9178 Lisp_Object list;
9179 ptrdiff_t start_byte, end_byte;
9180 ptrdiff_t pos;
9181 const unsigned char *p, *pbeg, *pend;
9182 int c;
9183 Lisp_Object tail, elt, attrs;
9185 if (STRINGP (start))
9187 if (!STRING_MULTIBYTE (start)
9188 || SCHARS (start) == SBYTES (start))
9189 return Qnil;
9190 start_byte = 0;
9191 end_byte = SBYTES (start);
9192 pos = 0;
9194 else
9196 CHECK_NUMBER_COERCE_MARKER (start);
9197 CHECK_NUMBER_COERCE_MARKER (end);
9198 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9199 args_out_of_range (start, end);
9200 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9201 return Qnil;
9202 start_byte = CHAR_TO_BYTE (XINT (start));
9203 end_byte = CHAR_TO_BYTE (XINT (end));
9204 if (XINT (end) - XINT (start) == end_byte - start_byte)
9205 return Qnil;
9207 if (XINT (start) < GPT && XINT (end) > GPT)
9209 if ((GPT - XINT (start)) < (XINT (end) - GPT))
9210 move_gap_both (XINT (start), start_byte);
9211 else
9212 move_gap_both (XINT (end), end_byte);
9214 pos = XINT (start);
9217 list = Qnil;
9218 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9220 elt = XCAR (tail);
9221 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9222 ASET (attrs, coding_attr_trans_tbl,
9223 get_translation_table (attrs, 1, NULL));
9224 list = Fcons (list2 (elt, attrs), list);
9227 if (STRINGP (start))
9228 p = pbeg = SDATA (start);
9229 else
9230 p = pbeg = BYTE_POS_ADDR (start_byte);
9231 pend = p + (end_byte - start_byte);
9233 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9234 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9236 while (p < pend)
9238 if (ASCII_BYTE_P (*p))
9239 p++;
9240 else
9242 c = STRING_CHAR_ADVANCE (p);
9244 charset_map_loaded = 0;
9245 for (tail = list; CONSP (tail); tail = XCDR (tail))
9247 elt = XCDR (XCAR (tail));
9248 if (! char_encodable_p (c, XCAR (elt)))
9249 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9251 if (charset_map_loaded)
9253 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9255 if (STRINGP (start))
9256 pbeg = SDATA (start);
9257 else
9258 pbeg = BYTE_POS_ADDR (start_byte);
9259 p = pbeg + p_offset;
9260 pend = pbeg + pend_offset;
9263 pos++;
9266 tail = list;
9267 list = Qnil;
9268 for (; CONSP (tail); tail = XCDR (tail))
9270 elt = XCAR (tail);
9271 if (CONSP (XCDR (XCDR (elt))))
9272 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9273 list);
9276 return list;
9280 static Lisp_Object
9281 code_convert_region (Lisp_Object start, Lisp_Object end,
9282 Lisp_Object coding_system, Lisp_Object dst_object,
9283 bool encodep, bool norecord)
9285 struct coding_system coding;
9286 ptrdiff_t from, from_byte, to, to_byte;
9287 Lisp_Object src_object;
9289 if (NILP (coding_system))
9290 coding_system = Qno_conversion;
9291 else
9292 CHECK_CODING_SYSTEM (coding_system);
9293 src_object = Fcurrent_buffer ();
9294 if (NILP (dst_object))
9295 dst_object = src_object;
9296 else if (! EQ (dst_object, Qt))
9297 CHECK_BUFFER (dst_object);
9299 validate_region (&start, &end);
9300 from = XFASTINT (start);
9301 from_byte = CHAR_TO_BYTE (from);
9302 to = XFASTINT (end);
9303 to_byte = CHAR_TO_BYTE (to);
9305 setup_coding_system (coding_system, &coding);
9306 coding.mode |= CODING_MODE_LAST_BLOCK;
9308 if (encodep)
9309 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9310 dst_object);
9311 else
9312 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9313 dst_object);
9314 if (! norecord)
9315 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9317 return (BUFFERP (dst_object)
9318 ? make_number (coding.produced_char)
9319 : coding.dst_object);
9323 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9324 3, 4, "r\nzCoding system: ",
9325 doc: /* Decode the current region from the specified coding system.
9326 When called from a program, takes four arguments:
9327 START, END, CODING-SYSTEM, and DESTINATION.
9328 START and END are buffer positions.
9330 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9331 If nil, the region between START and END is replaced by the decoded text.
9332 If buffer, the decoded text is inserted in that buffer after point (point
9333 does not move).
9334 In those cases, the length of the decoded text is returned.
9335 If DESTINATION is t, the decoded text is returned.
9337 This function sets `last-coding-system-used' to the precise coding system
9338 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9339 not fully specified.) */)
9340 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9342 return code_convert_region (start, end, coding_system, destination, 0, 0);
9345 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9346 3, 4, "r\nzCoding system: ",
9347 doc: /* Encode the current region by specified coding system.
9348 When called from a program, takes four arguments:
9349 START, END, CODING-SYSTEM and DESTINATION.
9350 START and END are buffer positions.
9352 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9353 If nil, the region between START and END is replace by the encoded text.
9354 If buffer, the encoded text is inserted in that buffer after point (point
9355 does not move).
9356 In those cases, the length of the encoded text is returned.
9357 If DESTINATION is t, the encoded text is returned.
9359 This function sets `last-coding-system-used' to the precise coding system
9360 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9361 not fully specified.) */)
9362 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9364 return code_convert_region (start, end, coding_system, destination, 1, 0);
9367 Lisp_Object
9368 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9369 Lisp_Object dst_object, bool encodep, bool nocopy,
9370 bool norecord)
9372 struct coding_system coding;
9373 ptrdiff_t chars, bytes;
9375 CHECK_STRING (string);
9376 if (NILP (coding_system))
9378 if (! norecord)
9379 Vlast_coding_system_used = Qno_conversion;
9380 if (NILP (dst_object))
9381 return (nocopy ? Fcopy_sequence (string) : string);
9384 if (NILP (coding_system))
9385 coding_system = Qno_conversion;
9386 else
9387 CHECK_CODING_SYSTEM (coding_system);
9388 if (NILP (dst_object))
9389 dst_object = Qt;
9390 else if (! EQ (dst_object, Qt))
9391 CHECK_BUFFER (dst_object);
9393 setup_coding_system (coding_system, &coding);
9394 coding.mode |= CODING_MODE_LAST_BLOCK;
9395 chars = SCHARS (string);
9396 bytes = SBYTES (string);
9397 if (encodep)
9398 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9399 else
9400 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9401 if (! norecord)
9402 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9404 return (BUFFERP (dst_object)
9405 ? make_number (coding.produced_char)
9406 : coding.dst_object);
9410 /* Encode or decode STRING according to CODING_SYSTEM.
9411 Do not set Vlast_coding_system_used.
9413 This function is called only from macros DECODE_FILE and
9414 ENCODE_FILE, thus we ignore character composition. */
9416 Lisp_Object
9417 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9418 bool encodep)
9420 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9424 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9425 2, 4, 0,
9426 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9428 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9429 if the decoding operation is trivial.
9431 Optional fourth arg BUFFER non-nil means that the decoded text is
9432 inserted in that buffer after point (point does not move). In this
9433 case, the return value is the length of the decoded text.
9435 This function sets `last-coding-system-used' to the precise coding system
9436 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9437 not fully specified.) */)
9438 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9440 return code_convert_string (string, coding_system, buffer,
9441 0, ! NILP (nocopy), 0);
9444 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9445 2, 4, 0,
9446 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9448 Optional third arg NOCOPY non-nil means it is OK to return STRING
9449 itself if the encoding operation is trivial.
9451 Optional fourth arg BUFFER non-nil means that the encoded text is
9452 inserted in that buffer after point (point does not move). In this
9453 case, the return value is the length of the encoded text.
9455 This function sets `last-coding-system-used' to the precise coding system
9456 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9457 not fully specified.) */)
9458 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9460 return code_convert_string (string, coding_system, buffer,
9461 1, ! NILP (nocopy), 0);
9465 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9466 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9467 Return the corresponding character. */)
9468 (Lisp_Object code)
9470 Lisp_Object spec, attrs, val;
9471 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9472 EMACS_INT ch;
9473 int c;
9475 CHECK_NATNUM (code);
9476 ch = XFASTINT (code);
9477 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9478 attrs = AREF (spec, 0);
9480 if (ASCII_BYTE_P (ch)
9481 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9482 return code;
9484 val = CODING_ATTR_CHARSET_LIST (attrs);
9485 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9486 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9487 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9489 if (ch <= 0x7F)
9491 c = ch;
9492 charset = charset_roman;
9494 else if (ch >= 0xA0 && ch < 0xDF)
9496 c = ch - 0x80;
9497 charset = charset_kana;
9499 else
9501 EMACS_INT c1 = ch >> 8;
9502 int c2 = ch & 0xFF;
9504 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9505 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9506 error ("Invalid code: %"pI"d", ch);
9507 c = ch;
9508 SJIS_TO_JIS (c);
9509 charset = charset_kanji;
9511 c = DECODE_CHAR (charset, c);
9512 if (c < 0)
9513 error ("Invalid code: %"pI"d", ch);
9514 return make_number (c);
9518 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9519 doc: /* Encode a Japanese character CH to shift_jis encoding.
9520 Return the corresponding code in SJIS. */)
9521 (Lisp_Object ch)
9523 Lisp_Object spec, attrs, charset_list;
9524 int c;
9525 struct charset *charset;
9526 unsigned code;
9528 CHECK_CHARACTER (ch);
9529 c = XFASTINT (ch);
9530 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9531 attrs = AREF (spec, 0);
9533 if (ASCII_CHAR_P (c)
9534 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9535 return ch;
9537 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9538 charset = char_charset (c, charset_list, &code);
9539 if (code == CHARSET_INVALID_CODE (charset))
9540 error ("Can't encode by shift_jis encoding: %c", c);
9541 JIS_TO_SJIS (code);
9543 return make_number (code);
9546 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9547 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9548 Return the corresponding character. */)
9549 (Lisp_Object code)
9551 Lisp_Object spec, attrs, val;
9552 struct charset *charset_roman, *charset_big5, *charset;
9553 EMACS_INT ch;
9554 int c;
9556 CHECK_NATNUM (code);
9557 ch = XFASTINT (code);
9558 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9559 attrs = AREF (spec, 0);
9561 if (ASCII_BYTE_P (ch)
9562 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9563 return code;
9565 val = CODING_ATTR_CHARSET_LIST (attrs);
9566 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9567 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9569 if (ch <= 0x7F)
9571 c = ch;
9572 charset = charset_roman;
9574 else
9576 EMACS_INT b1 = ch >> 8;
9577 int b2 = ch & 0x7F;
9578 if (b1 < 0xA1 || b1 > 0xFE
9579 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9580 error ("Invalid code: %"pI"d", ch);
9581 c = ch;
9582 charset = charset_big5;
9584 c = DECODE_CHAR (charset, c);
9585 if (c < 0)
9586 error ("Invalid code: %"pI"d", ch);
9587 return make_number (c);
9590 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9591 doc: /* Encode the Big5 character CH to BIG5 coding system.
9592 Return the corresponding character code in Big5. */)
9593 (Lisp_Object ch)
9595 Lisp_Object spec, attrs, charset_list;
9596 struct charset *charset;
9597 int c;
9598 unsigned code;
9600 CHECK_CHARACTER (ch);
9601 c = XFASTINT (ch);
9602 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9603 attrs = AREF (spec, 0);
9604 if (ASCII_CHAR_P (c)
9605 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9606 return ch;
9608 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9609 charset = char_charset (c, charset_list, &code);
9610 if (code == CHARSET_INVALID_CODE (charset))
9611 error ("Can't encode by Big5 encoding: %c", c);
9613 return make_number (code);
9617 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9618 Sset_terminal_coding_system_internal, 1, 2, 0,
9619 doc: /* Internal use only. */)
9620 (Lisp_Object coding_system, Lisp_Object terminal)
9622 struct terminal *term = get_terminal (terminal, 1);
9623 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9624 CHECK_SYMBOL (coding_system);
9625 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9626 /* We had better not send unsafe characters to terminal. */
9627 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9628 /* Character composition should be disabled. */
9629 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9630 terminal_coding->src_multibyte = 1;
9631 terminal_coding->dst_multibyte = 0;
9632 tset_charset_list
9633 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9634 ? coding_charset_list (terminal_coding)
9635 : list1 (make_number (charset_ascii))));
9636 return Qnil;
9639 DEFUN ("set-safe-terminal-coding-system-internal",
9640 Fset_safe_terminal_coding_system_internal,
9641 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9642 doc: /* Internal use only. */)
9643 (Lisp_Object coding_system)
9645 CHECK_SYMBOL (coding_system);
9646 setup_coding_system (Fcheck_coding_system (coding_system),
9647 &safe_terminal_coding);
9648 /* Character composition should be disabled. */
9649 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9650 safe_terminal_coding.src_multibyte = 1;
9651 safe_terminal_coding.dst_multibyte = 0;
9652 return Qnil;
9655 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9656 Sterminal_coding_system, 0, 1, 0,
9657 doc: /* Return coding system specified for terminal output on the given terminal.
9658 TERMINAL may be a terminal object, a frame, or nil for the selected
9659 frame's terminal device. */)
9660 (Lisp_Object terminal)
9662 struct coding_system *terminal_coding
9663 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9664 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9666 /* For backward compatibility, return nil if it is `undecided'. */
9667 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9670 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9671 Sset_keyboard_coding_system_internal, 1, 2, 0,
9672 doc: /* Internal use only. */)
9673 (Lisp_Object coding_system, Lisp_Object terminal)
9675 struct terminal *t = get_terminal (terminal, 1);
9676 CHECK_SYMBOL (coding_system);
9677 if (NILP (coding_system))
9678 coding_system = Qno_conversion;
9679 else
9680 Fcheck_coding_system (coding_system);
9681 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9682 /* Character composition should be disabled. */
9683 TERMINAL_KEYBOARD_CODING (t)->common_flags
9684 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9685 return Qnil;
9688 DEFUN ("keyboard-coding-system",
9689 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9690 doc: /* Return coding system specified for decoding keyboard input. */)
9691 (Lisp_Object terminal)
9693 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9694 (get_terminal (terminal, 1))->id);
9698 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9699 Sfind_operation_coding_system, 1, MANY, 0,
9700 doc: /* Choose a coding system for an operation based on the target name.
9701 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9702 DECODING-SYSTEM is the coding system to use for decoding
9703 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9704 for encoding (in case OPERATION does encoding).
9706 The first argument OPERATION specifies an I/O primitive:
9707 For file I/O, `insert-file-contents' or `write-region'.
9708 For process I/O, `call-process', `call-process-region', or `start-process'.
9709 For network I/O, `open-network-stream'.
9711 The remaining arguments should be the same arguments that were passed
9712 to the primitive. Depending on which primitive, one of those arguments
9713 is selected as the TARGET. For example, if OPERATION does file I/O,
9714 whichever argument specifies the file name is TARGET.
9716 TARGET has a meaning which depends on OPERATION:
9717 For file I/O, TARGET is a file name (except for the special case below).
9718 For process I/O, TARGET is a process name.
9719 For network I/O, TARGET is a service name or a port number.
9721 This function looks up what is specified for TARGET in
9722 `file-coding-system-alist', `process-coding-system-alist',
9723 or `network-coding-system-alist' depending on OPERATION.
9724 They may specify a coding system, a cons of coding systems,
9725 or a function symbol to call.
9726 In the last case, we call the function with one argument,
9727 which is a list of all the arguments given to this function.
9728 If the function can't decide a coding system, it can return
9729 `undecided' so that the normal code-detection is performed.
9731 If OPERATION is `insert-file-contents', the argument corresponding to
9732 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9733 file name to look up, and BUFFER is a buffer that contains the file's
9734 contents (not yet decoded). If `file-coding-system-alist' specifies a
9735 function to call for FILENAME, that function should examine the
9736 contents of BUFFER instead of reading the file.
9738 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9739 (ptrdiff_t nargs, Lisp_Object *args)
9741 Lisp_Object operation, target_idx, target, val;
9742 register Lisp_Object chain;
9744 if (nargs < 2)
9745 error ("Too few arguments");
9746 operation = args[0];
9747 if (!SYMBOLP (operation)
9748 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9749 error ("Invalid first argument");
9750 if (nargs <= 1 + XFASTINT (target_idx))
9751 error ("Too few arguments for operation `%s'",
9752 SDATA (SYMBOL_NAME (operation)));
9753 target = args[XFASTINT (target_idx) + 1];
9754 if (!(STRINGP (target)
9755 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9756 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9757 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9758 error ("Invalid argument %"pI"d of operation `%s'",
9759 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9760 if (CONSP (target))
9761 target = XCAR (target);
9763 chain = ((EQ (operation, Qinsert_file_contents)
9764 || EQ (operation, Qwrite_region))
9765 ? Vfile_coding_system_alist
9766 : (EQ (operation, Qopen_network_stream)
9767 ? Vnetwork_coding_system_alist
9768 : Vprocess_coding_system_alist));
9769 if (NILP (chain))
9770 return Qnil;
9772 for (; CONSP (chain); chain = XCDR (chain))
9774 Lisp_Object elt;
9776 elt = XCAR (chain);
9777 if (CONSP (elt)
9778 && ((STRINGP (target)
9779 && STRINGP (XCAR (elt))
9780 && fast_string_match (XCAR (elt), target) >= 0)
9781 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9783 val = XCDR (elt);
9784 /* Here, if VAL is both a valid coding system and a valid
9785 function symbol, we return VAL as a coding system. */
9786 if (CONSP (val))
9787 return val;
9788 if (! SYMBOLP (val))
9789 return Qnil;
9790 if (! NILP (Fcoding_system_p (val)))
9791 return Fcons (val, val);
9792 if (! NILP (Ffboundp (val)))
9794 /* We use call1 rather than safe_call1
9795 so as to get bug reports about functions called here
9796 which don't handle the current interface. */
9797 val = call1 (val, Flist (nargs, args));
9798 if (CONSP (val))
9799 return val;
9800 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9801 return Fcons (val, val);
9803 return Qnil;
9806 return Qnil;
9809 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9810 Sset_coding_system_priority, 0, MANY, 0,
9811 doc: /* Assign higher priority to the coding systems given as arguments.
9812 If multiple coding systems belong to the same category,
9813 all but the first one are ignored.
9815 usage: (set-coding-system-priority &rest coding-systems) */)
9816 (ptrdiff_t nargs, Lisp_Object *args)
9818 ptrdiff_t i, j;
9819 bool changed[coding_category_max];
9820 enum coding_category priorities[coding_category_max];
9822 memset (changed, 0, sizeof changed);
9824 for (i = j = 0; i < nargs; i++)
9826 enum coding_category category;
9827 Lisp_Object spec, attrs;
9829 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9830 attrs = AREF (spec, 0);
9831 category = XINT (CODING_ATTR_CATEGORY (attrs));
9832 if (changed[category])
9833 /* Ignore this coding system because a coding system of the
9834 same category already had a higher priority. */
9835 continue;
9836 changed[category] = 1;
9837 priorities[j++] = category;
9838 if (coding_categories[category].id >= 0
9839 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9840 setup_coding_system (args[i], &coding_categories[category]);
9841 Fset (AREF (Vcoding_category_table, category), args[i]);
9844 /* Now we have decided top J priorities. Reflect the order of the
9845 original priorities to the remaining priorities. */
9847 for (i = j, j = 0; i < coding_category_max; i++, j++)
9849 while (j < coding_category_max
9850 && changed[coding_priorities[j]])
9851 j++;
9852 if (j == coding_category_max)
9853 emacs_abort ();
9854 priorities[i] = coding_priorities[j];
9857 memcpy (coding_priorities, priorities, sizeof priorities);
9859 /* Update `coding-category-list'. */
9860 Vcoding_category_list = Qnil;
9861 for (i = coding_category_max; i-- > 0; )
9862 Vcoding_category_list
9863 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9864 Vcoding_category_list);
9866 return Qnil;
9869 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9870 Scoding_system_priority_list, 0, 1, 0,
9871 doc: /* Return a list of coding systems ordered by their priorities.
9872 The list contains a subset of coding systems; i.e. coding systems
9873 assigned to each coding category (see `coding-category-list').
9875 HIGHESTP non-nil means just return the highest priority one. */)
9876 (Lisp_Object highestp)
9878 int i;
9879 Lisp_Object val;
9881 for (i = 0, val = Qnil; i < coding_category_max; i++)
9883 enum coding_category category = coding_priorities[i];
9884 int id = coding_categories[category].id;
9885 Lisp_Object attrs;
9887 if (id < 0)
9888 continue;
9889 attrs = CODING_ID_ATTRS (id);
9890 if (! NILP (highestp))
9891 return CODING_ATTR_BASE_NAME (attrs);
9892 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9894 return Fnreverse (val);
9897 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9899 static Lisp_Object
9900 make_subsidiaries (Lisp_Object base)
9902 Lisp_Object subsidiaries;
9903 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9904 char *buf = alloca (base_name_len + 6);
9905 int i;
9907 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9908 subsidiaries = make_uninit_vector (3);
9909 for (i = 0; i < 3; i++)
9911 strcpy (buf + base_name_len, suffixes[i]);
9912 ASET (subsidiaries, i, intern (buf));
9914 return subsidiaries;
9918 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9919 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9920 doc: /* For internal use only.
9921 usage: (define-coding-system-internal ...) */)
9922 (ptrdiff_t nargs, Lisp_Object *args)
9924 Lisp_Object name;
9925 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9926 Lisp_Object attrs; /* Vector of attributes. */
9927 Lisp_Object eol_type;
9928 Lisp_Object aliases;
9929 Lisp_Object coding_type, charset_list, safe_charsets;
9930 enum coding_category category;
9931 Lisp_Object tail, val;
9932 int max_charset_id = 0;
9933 int i;
9935 if (nargs < coding_arg_max)
9936 goto short_args;
9938 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9940 name = args[coding_arg_name];
9941 CHECK_SYMBOL (name);
9942 ASET (attrs, coding_attr_base_name, name);
9944 val = args[coding_arg_mnemonic];
9945 if (! STRINGP (val))
9946 CHECK_CHARACTER (val);
9947 ASET (attrs, coding_attr_mnemonic, val);
9949 coding_type = args[coding_arg_coding_type];
9950 CHECK_SYMBOL (coding_type);
9951 ASET (attrs, coding_attr_type, coding_type);
9953 charset_list = args[coding_arg_charset_list];
9954 if (SYMBOLP (charset_list))
9956 if (EQ (charset_list, Qiso_2022))
9958 if (! EQ (coding_type, Qiso_2022))
9959 error ("Invalid charset-list");
9960 charset_list = Viso_2022_charset_list;
9962 else if (EQ (charset_list, Qemacs_mule))
9964 if (! EQ (coding_type, Qemacs_mule))
9965 error ("Invalid charset-list");
9966 charset_list = Vemacs_mule_charset_list;
9968 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9970 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9971 error ("Invalid charset-list");
9972 if (max_charset_id < XFASTINT (XCAR (tail)))
9973 max_charset_id = XFASTINT (XCAR (tail));
9976 else
9978 charset_list = Fcopy_sequence (charset_list);
9979 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9981 struct charset *charset;
9983 val = XCAR (tail);
9984 CHECK_CHARSET_GET_CHARSET (val, charset);
9985 if (EQ (coding_type, Qiso_2022)
9986 ? CHARSET_ISO_FINAL (charset) < 0
9987 : EQ (coding_type, Qemacs_mule)
9988 ? CHARSET_EMACS_MULE_ID (charset) < 0
9989 : 0)
9990 error ("Can't handle charset `%s'",
9991 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9993 XSETCAR (tail, make_number (charset->id));
9994 if (max_charset_id < charset->id)
9995 max_charset_id = charset->id;
9998 ASET (attrs, coding_attr_charset_list, charset_list);
10000 safe_charsets = make_uninit_string (max_charset_id + 1);
10001 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10002 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10003 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10004 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10006 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10008 val = args[coding_arg_decode_translation_table];
10009 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10010 CHECK_SYMBOL (val);
10011 ASET (attrs, coding_attr_decode_tbl, val);
10013 val = args[coding_arg_encode_translation_table];
10014 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10015 CHECK_SYMBOL (val);
10016 ASET (attrs, coding_attr_encode_tbl, val);
10018 val = args[coding_arg_post_read_conversion];
10019 CHECK_SYMBOL (val);
10020 ASET (attrs, coding_attr_post_read, val);
10022 val = args[coding_arg_pre_write_conversion];
10023 CHECK_SYMBOL (val);
10024 ASET (attrs, coding_attr_pre_write, val);
10026 val = args[coding_arg_default_char];
10027 if (NILP (val))
10028 ASET (attrs, coding_attr_default_char, make_number (' '));
10029 else
10031 CHECK_CHARACTER (val);
10032 ASET (attrs, coding_attr_default_char, val);
10035 val = args[coding_arg_for_unibyte];
10036 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10038 val = args[coding_arg_plist];
10039 CHECK_LIST (val);
10040 ASET (attrs, coding_attr_plist, val);
10042 if (EQ (coding_type, Qcharset))
10044 /* Generate a lisp vector of 256 elements. Each element is nil,
10045 integer, or a list of charset IDs.
10047 If Nth element is nil, the byte code N is invalid in this
10048 coding system.
10050 If Nth element is a number NUM, N is the first byte of a
10051 charset whose ID is NUM.
10053 If Nth element is a list of charset IDs, N is the first byte
10054 of one of them. The list is sorted by dimensions of the
10055 charsets. A charset of smaller dimension comes first. */
10056 val = Fmake_vector (make_number (256), Qnil);
10058 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10060 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10061 int dim = CHARSET_DIMENSION (charset);
10062 int idx = (dim - 1) * 4;
10064 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10065 ASET (attrs, coding_attr_ascii_compat, Qt);
10067 for (i = charset->code_space[idx];
10068 i <= charset->code_space[idx + 1]; i++)
10070 Lisp_Object tmp, tmp2;
10071 int dim2;
10073 tmp = AREF (val, i);
10074 if (NILP (tmp))
10075 tmp = XCAR (tail);
10076 else if (NUMBERP (tmp))
10078 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10079 if (dim < dim2)
10080 tmp = list2 (XCAR (tail), tmp);
10081 else
10082 tmp = list2 (tmp, XCAR (tail));
10084 else
10086 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10088 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10089 if (dim < dim2)
10090 break;
10092 if (NILP (tmp2))
10093 tmp = nconc2 (tmp, list1 (XCAR (tail)));
10094 else
10096 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10097 XSETCAR (tmp2, XCAR (tail));
10100 ASET (val, i, tmp);
10103 ASET (attrs, coding_attr_charset_valids, val);
10104 category = coding_category_charset;
10106 else if (EQ (coding_type, Qccl))
10108 Lisp_Object valids;
10110 if (nargs < coding_arg_ccl_max)
10111 goto short_args;
10113 val = args[coding_arg_ccl_decoder];
10114 CHECK_CCL_PROGRAM (val);
10115 if (VECTORP (val))
10116 val = Fcopy_sequence (val);
10117 ASET (attrs, coding_attr_ccl_decoder, val);
10119 val = args[coding_arg_ccl_encoder];
10120 CHECK_CCL_PROGRAM (val);
10121 if (VECTORP (val))
10122 val = Fcopy_sequence (val);
10123 ASET (attrs, coding_attr_ccl_encoder, val);
10125 val = args[coding_arg_ccl_valids];
10126 valids = Fmake_string (make_number (256), make_number (0));
10127 for (tail = val; CONSP (tail); tail = XCDR (tail))
10129 int from, to;
10131 val = XCAR (tail);
10132 if (INTEGERP (val))
10134 if (! (0 <= XINT (val) && XINT (val) <= 255))
10135 args_out_of_range_3 (val, make_number (0), make_number (255));
10136 from = to = XINT (val);
10138 else
10140 CHECK_CONS (val);
10141 CHECK_NATNUM_CAR (val);
10142 CHECK_NUMBER_CDR (val);
10143 if (XINT (XCAR (val)) > 255)
10144 args_out_of_range_3 (XCAR (val),
10145 make_number (0), make_number (255));
10146 from = XINT (XCAR (val));
10147 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10148 args_out_of_range_3 (XCDR (val),
10149 XCAR (val), make_number (255));
10150 to = XINT (XCDR (val));
10152 for (i = from; i <= to; i++)
10153 SSET (valids, i, 1);
10155 ASET (attrs, coding_attr_ccl_valids, valids);
10157 category = coding_category_ccl;
10159 else if (EQ (coding_type, Qutf_16))
10161 Lisp_Object bom, endian;
10163 ASET (attrs, coding_attr_ascii_compat, Qnil);
10165 if (nargs < coding_arg_utf16_max)
10166 goto short_args;
10168 bom = args[coding_arg_utf16_bom];
10169 if (! NILP (bom) && ! EQ (bom, Qt))
10171 CHECK_CONS (bom);
10172 val = XCAR (bom);
10173 CHECK_CODING_SYSTEM (val);
10174 val = XCDR (bom);
10175 CHECK_CODING_SYSTEM (val);
10177 ASET (attrs, coding_attr_utf_bom, bom);
10179 endian = args[coding_arg_utf16_endian];
10180 CHECK_SYMBOL (endian);
10181 if (NILP (endian))
10182 endian = Qbig;
10183 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10184 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10185 ASET (attrs, coding_attr_utf_16_endian, endian);
10187 category = (CONSP (bom)
10188 ? coding_category_utf_16_auto
10189 : NILP (bom)
10190 ? (EQ (endian, Qbig)
10191 ? coding_category_utf_16_be_nosig
10192 : coding_category_utf_16_le_nosig)
10193 : (EQ (endian, Qbig)
10194 ? coding_category_utf_16_be
10195 : coding_category_utf_16_le));
10197 else if (EQ (coding_type, Qiso_2022))
10199 Lisp_Object initial, reg_usage, request, flags;
10201 if (nargs < coding_arg_iso2022_max)
10202 goto short_args;
10204 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10205 CHECK_VECTOR (initial);
10206 for (i = 0; i < 4; i++)
10208 val = AREF (initial, i);
10209 if (! NILP (val))
10211 struct charset *charset;
10213 CHECK_CHARSET_GET_CHARSET (val, charset);
10214 ASET (initial, i, make_number (CHARSET_ID (charset)));
10215 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10216 ASET (attrs, coding_attr_ascii_compat, Qt);
10218 else
10219 ASET (initial, i, make_number (-1));
10222 reg_usage = args[coding_arg_iso2022_reg_usage];
10223 CHECK_CONS (reg_usage);
10224 CHECK_NUMBER_CAR (reg_usage);
10225 CHECK_NUMBER_CDR (reg_usage);
10227 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10228 for (tail = request; CONSP (tail); tail = XCDR (tail))
10230 int id;
10231 Lisp_Object tmp1;
10233 val = XCAR (tail);
10234 CHECK_CONS (val);
10235 tmp1 = XCAR (val);
10236 CHECK_CHARSET_GET_ID (tmp1, id);
10237 CHECK_NATNUM_CDR (val);
10238 if (XINT (XCDR (val)) >= 4)
10239 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10240 XSETCAR (val, make_number (id));
10243 flags = args[coding_arg_iso2022_flags];
10244 CHECK_NATNUM (flags);
10245 i = XINT (flags) & INT_MAX;
10246 if (EQ (args[coding_arg_charset_list], Qiso_2022))
10247 i |= CODING_ISO_FLAG_FULL_SUPPORT;
10248 flags = make_number (i);
10250 ASET (attrs, coding_attr_iso_initial, initial);
10251 ASET (attrs, coding_attr_iso_usage, reg_usage);
10252 ASET (attrs, coding_attr_iso_request, request);
10253 ASET (attrs, coding_attr_iso_flags, flags);
10254 setup_iso_safe_charsets (attrs);
10256 if (i & CODING_ISO_FLAG_SEVEN_BITS)
10257 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10258 | CODING_ISO_FLAG_SINGLE_SHIFT))
10259 ? coding_category_iso_7_else
10260 : EQ (args[coding_arg_charset_list], Qiso_2022)
10261 ? coding_category_iso_7
10262 : coding_category_iso_7_tight);
10263 else
10265 int id = XINT (AREF (initial, 1));
10267 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10268 || EQ (args[coding_arg_charset_list], Qiso_2022)
10269 || id < 0)
10270 ? coding_category_iso_8_else
10271 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10272 ? coding_category_iso_8_1
10273 : coding_category_iso_8_2);
10275 if (category != coding_category_iso_8_1
10276 && category != coding_category_iso_8_2)
10277 ASET (attrs, coding_attr_ascii_compat, Qnil);
10279 else if (EQ (coding_type, Qemacs_mule))
10281 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10282 ASET (attrs, coding_attr_emacs_mule_full, Qt);
10283 ASET (attrs, coding_attr_ascii_compat, Qt);
10284 category = coding_category_emacs_mule;
10286 else if (EQ (coding_type, Qshift_jis))
10289 struct charset *charset;
10291 if (XINT (Flength (charset_list)) != 3
10292 && XINT (Flength (charset_list)) != 4)
10293 error ("There should be three or four charsets");
10295 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10296 if (CHARSET_DIMENSION (charset) != 1)
10297 error ("Dimension of charset %s is not one",
10298 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10299 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10300 ASET (attrs, coding_attr_ascii_compat, Qt);
10302 charset_list = XCDR (charset_list);
10303 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10304 if (CHARSET_DIMENSION (charset) != 1)
10305 error ("Dimension of charset %s is not one",
10306 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10308 charset_list = XCDR (charset_list);
10309 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10310 if (CHARSET_DIMENSION (charset) != 2)
10311 error ("Dimension of charset %s is not two",
10312 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10314 charset_list = XCDR (charset_list);
10315 if (! NILP (charset_list))
10317 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10318 if (CHARSET_DIMENSION (charset) != 2)
10319 error ("Dimension of charset %s is not two",
10320 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10323 category = coding_category_sjis;
10324 Vsjis_coding_system = name;
10326 else if (EQ (coding_type, Qbig5))
10328 struct charset *charset;
10330 if (XINT (Flength (charset_list)) != 2)
10331 error ("There should be just two charsets");
10333 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10334 if (CHARSET_DIMENSION (charset) != 1)
10335 error ("Dimension of charset %s is not one",
10336 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10337 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10338 ASET (attrs, coding_attr_ascii_compat, Qt);
10340 charset_list = XCDR (charset_list);
10341 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10342 if (CHARSET_DIMENSION (charset) != 2)
10343 error ("Dimension of charset %s is not two",
10344 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10346 category = coding_category_big5;
10347 Vbig5_coding_system = name;
10349 else if (EQ (coding_type, Qraw_text))
10351 category = coding_category_raw_text;
10352 ASET (attrs, coding_attr_ascii_compat, Qt);
10354 else if (EQ (coding_type, Qutf_8))
10356 Lisp_Object bom;
10358 if (nargs < coding_arg_utf8_max)
10359 goto short_args;
10361 bom = args[coding_arg_utf8_bom];
10362 if (! NILP (bom) && ! EQ (bom, Qt))
10364 CHECK_CONS (bom);
10365 val = XCAR (bom);
10366 CHECK_CODING_SYSTEM (val);
10367 val = XCDR (bom);
10368 CHECK_CODING_SYSTEM (val);
10370 ASET (attrs, coding_attr_utf_bom, bom);
10371 if (NILP (bom))
10372 ASET (attrs, coding_attr_ascii_compat, Qt);
10374 category = (CONSP (bom) ? coding_category_utf_8_auto
10375 : NILP (bom) ? coding_category_utf_8_nosig
10376 : coding_category_utf_8_sig);
10378 else if (EQ (coding_type, Qundecided))
10380 if (nargs < coding_arg_undecided_max)
10381 goto short_args;
10382 ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10383 args[coding_arg_undecided_inhibit_null_byte_detection]);
10384 ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10385 args[coding_arg_undecided_inhibit_iso_escape_detection]);
10386 ASET (attrs, coding_attr_undecided_prefer_utf_8,
10387 args[coding_arg_undecided_prefer_utf_8]);
10388 category = coding_category_undecided;
10390 else
10391 error ("Invalid coding system type: %s",
10392 SDATA (SYMBOL_NAME (coding_type)));
10394 ASET (attrs, coding_attr_category, make_number (category));
10395 ASET (attrs, coding_attr_plist,
10396 Fcons (QCcategory,
10397 Fcons (AREF (Vcoding_category_table, category),
10398 CODING_ATTR_PLIST (attrs))));
10399 ASET (attrs, coding_attr_plist,
10400 Fcons (QCascii_compatible_p,
10401 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10402 CODING_ATTR_PLIST (attrs))));
10404 eol_type = args[coding_arg_eol_type];
10405 if (! NILP (eol_type)
10406 && ! EQ (eol_type, Qunix)
10407 && ! EQ (eol_type, Qdos)
10408 && ! EQ (eol_type, Qmac))
10409 error ("Invalid eol-type");
10411 aliases = list1 (name);
10413 if (NILP (eol_type))
10415 eol_type = make_subsidiaries (name);
10416 for (i = 0; i < 3; i++)
10418 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10420 this_name = AREF (eol_type, i);
10421 this_aliases = list1 (this_name);
10422 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10423 this_spec = make_uninit_vector (3);
10424 ASET (this_spec, 0, attrs);
10425 ASET (this_spec, 1, this_aliases);
10426 ASET (this_spec, 2, this_eol_type);
10427 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10428 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10429 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10430 if (NILP (val))
10431 Vcoding_system_alist
10432 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10433 Vcoding_system_alist);
10437 spec_vec = make_uninit_vector (3);
10438 ASET (spec_vec, 0, attrs);
10439 ASET (spec_vec, 1, aliases);
10440 ASET (spec_vec, 2, eol_type);
10442 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10443 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10444 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10445 if (NILP (val))
10446 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10447 Vcoding_system_alist);
10450 int id = coding_categories[category].id;
10452 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10453 setup_coding_system (name, &coding_categories[category]);
10456 return Qnil;
10458 short_args:
10459 return Fsignal (Qwrong_number_of_arguments,
10460 Fcons (intern ("define-coding-system-internal"),
10461 make_number (nargs)));
10465 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10466 3, 3, 0,
10467 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10468 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10470 Lisp_Object spec, attrs;
10472 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10473 attrs = AREF (spec, 0);
10474 if (EQ (prop, QCmnemonic))
10476 if (! STRINGP (val))
10477 CHECK_CHARACTER (val);
10478 ASET (attrs, coding_attr_mnemonic, val);
10480 else if (EQ (prop, QCdefault_char))
10482 if (NILP (val))
10483 val = make_number (' ');
10484 else
10485 CHECK_CHARACTER (val);
10486 ASET (attrs, coding_attr_default_char, val);
10488 else if (EQ (prop, QCdecode_translation_table))
10490 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10491 CHECK_SYMBOL (val);
10492 ASET (attrs, coding_attr_decode_tbl, val);
10494 else if (EQ (prop, QCencode_translation_table))
10496 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10497 CHECK_SYMBOL (val);
10498 ASET (attrs, coding_attr_encode_tbl, val);
10500 else if (EQ (prop, QCpost_read_conversion))
10502 CHECK_SYMBOL (val);
10503 ASET (attrs, coding_attr_post_read, val);
10505 else if (EQ (prop, QCpre_write_conversion))
10507 CHECK_SYMBOL (val);
10508 ASET (attrs, coding_attr_pre_write, val);
10510 else if (EQ (prop, QCascii_compatible_p))
10512 ASET (attrs, coding_attr_ascii_compat, val);
10515 ASET (attrs, coding_attr_plist,
10516 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10517 return val;
10521 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10522 Sdefine_coding_system_alias, 2, 2, 0,
10523 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10524 (Lisp_Object alias, Lisp_Object coding_system)
10526 Lisp_Object spec, aliases, eol_type, val;
10528 CHECK_SYMBOL (alias);
10529 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10530 aliases = AREF (spec, 1);
10531 /* ALIASES should be a list of length more than zero, and the first
10532 element is a base coding system. Append ALIAS at the tail of the
10533 list. */
10534 while (!NILP (XCDR (aliases)))
10535 aliases = XCDR (aliases);
10536 XSETCDR (aliases, list1 (alias));
10538 eol_type = AREF (spec, 2);
10539 if (VECTORP (eol_type))
10541 Lisp_Object subsidiaries;
10542 int i;
10544 subsidiaries = make_subsidiaries (alias);
10545 for (i = 0; i < 3; i++)
10546 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10547 AREF (eol_type, i));
10550 Fputhash (alias, spec, Vcoding_system_hash_table);
10551 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10552 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10553 if (NILP (val))
10554 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10555 Vcoding_system_alist);
10557 return Qnil;
10560 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10561 1, 1, 0,
10562 doc: /* Return the base of CODING-SYSTEM.
10563 Any alias or subsidiary coding system is not a base coding system. */)
10564 (Lisp_Object coding_system)
10566 Lisp_Object spec, attrs;
10568 if (NILP (coding_system))
10569 return (Qno_conversion);
10570 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10571 attrs = AREF (spec, 0);
10572 return CODING_ATTR_BASE_NAME (attrs);
10575 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10576 1, 1, 0,
10577 doc: "Return the property list of CODING-SYSTEM.")
10578 (Lisp_Object coding_system)
10580 Lisp_Object spec, attrs;
10582 if (NILP (coding_system))
10583 coding_system = Qno_conversion;
10584 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10585 attrs = AREF (spec, 0);
10586 return CODING_ATTR_PLIST (attrs);
10590 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10591 1, 1, 0,
10592 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10593 (Lisp_Object coding_system)
10595 Lisp_Object spec;
10597 if (NILP (coding_system))
10598 coding_system = Qno_conversion;
10599 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10600 return AREF (spec, 1);
10603 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10604 Scoding_system_eol_type, 1, 1, 0,
10605 doc: /* Return eol-type of CODING-SYSTEM.
10606 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10608 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10609 and CR respectively.
10611 A vector value indicates that a format of end-of-line should be
10612 detected automatically. Nth element of the vector is the subsidiary
10613 coding system whose eol-type is N. */)
10614 (Lisp_Object coding_system)
10616 Lisp_Object spec, eol_type;
10617 int n;
10619 if (NILP (coding_system))
10620 coding_system = Qno_conversion;
10621 if (! CODING_SYSTEM_P (coding_system))
10622 return Qnil;
10623 spec = CODING_SYSTEM_SPEC (coding_system);
10624 eol_type = AREF (spec, 2);
10625 if (VECTORP (eol_type))
10626 return Fcopy_sequence (eol_type);
10627 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10628 return make_number (n);
10631 #endif /* emacs */
10634 /*** 9. Post-amble ***/
10636 void
10637 init_coding_once (void)
10639 int i;
10641 for (i = 0; i < coding_category_max; i++)
10643 coding_categories[i].id = -1;
10644 coding_priorities[i] = i;
10647 /* ISO2022 specific initialize routine. */
10648 for (i = 0; i < 0x20; i++)
10649 iso_code_class[i] = ISO_control_0;
10650 for (i = 0x21; i < 0x7F; i++)
10651 iso_code_class[i] = ISO_graphic_plane_0;
10652 for (i = 0x80; i < 0xA0; i++)
10653 iso_code_class[i] = ISO_control_1;
10654 for (i = 0xA1; i < 0xFF; i++)
10655 iso_code_class[i] = ISO_graphic_plane_1;
10656 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10657 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10658 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10659 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10660 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10661 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10662 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10663 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10664 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10666 for (i = 0; i < 256; i++)
10668 emacs_mule_bytes[i] = 1;
10670 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10671 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10672 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10673 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10676 #ifdef emacs
10678 void
10679 syms_of_coding (void)
10681 staticpro (&Vcoding_system_hash_table);
10683 Lisp_Object args[2];
10684 args[0] = QCtest;
10685 args[1] = Qeq;
10686 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10689 staticpro (&Vsjis_coding_system);
10690 Vsjis_coding_system = Qnil;
10692 staticpro (&Vbig5_coding_system);
10693 Vbig5_coding_system = Qnil;
10695 staticpro (&Vcode_conversion_reused_workbuf);
10696 Vcode_conversion_reused_workbuf = Qnil;
10698 staticpro (&Vcode_conversion_workbuf_name);
10699 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10701 reused_workbuf_in_use = 0;
10703 DEFSYM (Qcharset, "charset");
10704 DEFSYM (Qtarget_idx, "target-idx");
10705 DEFSYM (Qcoding_system_history, "coding-system-history");
10706 Fset (Qcoding_system_history, Qnil);
10708 /* Target FILENAME is the first argument. */
10709 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10710 /* Target FILENAME is the third argument. */
10711 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10713 DEFSYM (Qcall_process, "call-process");
10714 /* Target PROGRAM is the first argument. */
10715 Fput (Qcall_process, Qtarget_idx, make_number (0));
10717 DEFSYM (Qcall_process_region, "call-process-region");
10718 /* Target PROGRAM is the third argument. */
10719 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10721 DEFSYM (Qstart_process, "start-process");
10722 /* Target PROGRAM is the third argument. */
10723 Fput (Qstart_process, Qtarget_idx, make_number (2));
10725 DEFSYM (Qopen_network_stream, "open-network-stream");
10726 /* Target SERVICE is the fourth argument. */
10727 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10729 DEFSYM (Qcoding_system, "coding-system");
10730 DEFSYM (Qcoding_aliases, "coding-aliases");
10732 DEFSYM (Qeol_type, "eol-type");
10733 DEFSYM (Qunix, "unix");
10734 DEFSYM (Qdos, "dos");
10735 DEFSYM (Qmac, "mac");
10737 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10738 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10739 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10740 DEFSYM (Qdefault_char, "default-char");
10741 DEFSYM (Qundecided, "undecided");
10742 DEFSYM (Qno_conversion, "no-conversion");
10743 DEFSYM (Qraw_text, "raw-text");
10745 DEFSYM (Qiso_2022, "iso-2022");
10747 DEFSYM (Qutf_8, "utf-8");
10748 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10750 #if defined (WINDOWSNT) || defined (CYGWIN)
10751 /* No, not utf-16-le: that one has a BOM. */
10752 DEFSYM (Qutf_16le, "utf-16le");
10753 #endif
10755 DEFSYM (Qutf_16, "utf-16");
10756 DEFSYM (Qbig, "big");
10757 DEFSYM (Qlittle, "little");
10759 DEFSYM (Qshift_jis, "shift-jis");
10760 DEFSYM (Qbig5, "big5");
10762 DEFSYM (Qcoding_system_p, "coding-system-p");
10764 DEFSYM (Qcoding_system_error, "coding-system-error");
10765 Fput (Qcoding_system_error, Qerror_conditions,
10766 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10767 Fput (Qcoding_system_error, Qerror_message,
10768 build_pure_c_string ("Invalid coding system"));
10770 /* Intern this now in case it isn't already done.
10771 Setting this variable twice is harmless.
10772 But don't staticpro it here--that is done in alloc.c. */
10773 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10775 DEFSYM (Qtranslation_table, "translation-table");
10776 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10777 DEFSYM (Qtranslation_table_id, "translation-table-id");
10778 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10779 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10781 DEFSYM (Qvalid_codes, "valid-codes");
10783 DEFSYM (Qemacs_mule, "emacs-mule");
10785 DEFSYM (QCcategory, ":category");
10786 DEFSYM (QCmnemonic, ":mnemonic");
10787 DEFSYM (QCdefault_char, ":default-char");
10788 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10789 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10790 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10791 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10792 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10794 Vcoding_category_table
10795 = Fmake_vector (make_number (coding_category_max), Qnil);
10796 staticpro (&Vcoding_category_table);
10797 /* Followings are target of code detection. */
10798 ASET (Vcoding_category_table, coding_category_iso_7,
10799 intern_c_string ("coding-category-iso-7"));
10800 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10801 intern_c_string ("coding-category-iso-7-tight"));
10802 ASET (Vcoding_category_table, coding_category_iso_8_1,
10803 intern_c_string ("coding-category-iso-8-1"));
10804 ASET (Vcoding_category_table, coding_category_iso_8_2,
10805 intern_c_string ("coding-category-iso-8-2"));
10806 ASET (Vcoding_category_table, coding_category_iso_7_else,
10807 intern_c_string ("coding-category-iso-7-else"));
10808 ASET (Vcoding_category_table, coding_category_iso_8_else,
10809 intern_c_string ("coding-category-iso-8-else"));
10810 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10811 intern_c_string ("coding-category-utf-8-auto"));
10812 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10813 intern_c_string ("coding-category-utf-8"));
10814 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10815 intern_c_string ("coding-category-utf-8-sig"));
10816 ASET (Vcoding_category_table, coding_category_utf_16_be,
10817 intern_c_string ("coding-category-utf-16-be"));
10818 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10819 intern_c_string ("coding-category-utf-16-auto"));
10820 ASET (Vcoding_category_table, coding_category_utf_16_le,
10821 intern_c_string ("coding-category-utf-16-le"));
10822 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10823 intern_c_string ("coding-category-utf-16-be-nosig"));
10824 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10825 intern_c_string ("coding-category-utf-16-le-nosig"));
10826 ASET (Vcoding_category_table, coding_category_charset,
10827 intern_c_string ("coding-category-charset"));
10828 ASET (Vcoding_category_table, coding_category_sjis,
10829 intern_c_string ("coding-category-sjis"));
10830 ASET (Vcoding_category_table, coding_category_big5,
10831 intern_c_string ("coding-category-big5"));
10832 ASET (Vcoding_category_table, coding_category_ccl,
10833 intern_c_string ("coding-category-ccl"));
10834 ASET (Vcoding_category_table, coding_category_emacs_mule,
10835 intern_c_string ("coding-category-emacs-mule"));
10836 /* Followings are NOT target of code detection. */
10837 ASET (Vcoding_category_table, coding_category_raw_text,
10838 intern_c_string ("coding-category-raw-text"));
10839 ASET (Vcoding_category_table, coding_category_undecided,
10840 intern_c_string ("coding-category-undecided"));
10842 DEFSYM (Qinsufficient_source, "insufficient-source");
10843 DEFSYM (Qinvalid_source, "invalid-source");
10844 DEFSYM (Qinterrupted, "interrupted");
10845 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10847 defsubr (&Scoding_system_p);
10848 defsubr (&Sread_coding_system);
10849 defsubr (&Sread_non_nil_coding_system);
10850 defsubr (&Scheck_coding_system);
10851 defsubr (&Sdetect_coding_region);
10852 defsubr (&Sdetect_coding_string);
10853 defsubr (&Sfind_coding_systems_region_internal);
10854 defsubr (&Sunencodable_char_position);
10855 defsubr (&Scheck_coding_systems_region);
10856 defsubr (&Sdecode_coding_region);
10857 defsubr (&Sencode_coding_region);
10858 defsubr (&Sdecode_coding_string);
10859 defsubr (&Sencode_coding_string);
10860 defsubr (&Sdecode_sjis_char);
10861 defsubr (&Sencode_sjis_char);
10862 defsubr (&Sdecode_big5_char);
10863 defsubr (&Sencode_big5_char);
10864 defsubr (&Sset_terminal_coding_system_internal);
10865 defsubr (&Sset_safe_terminal_coding_system_internal);
10866 defsubr (&Sterminal_coding_system);
10867 defsubr (&Sset_keyboard_coding_system_internal);
10868 defsubr (&Skeyboard_coding_system);
10869 defsubr (&Sfind_operation_coding_system);
10870 defsubr (&Sset_coding_system_priority);
10871 defsubr (&Sdefine_coding_system_internal);
10872 defsubr (&Sdefine_coding_system_alias);
10873 defsubr (&Scoding_system_put);
10874 defsubr (&Scoding_system_base);
10875 defsubr (&Scoding_system_plist);
10876 defsubr (&Scoding_system_aliases);
10877 defsubr (&Scoding_system_eol_type);
10878 defsubr (&Scoding_system_priority_list);
10880 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10881 doc: /* List of coding systems.
10883 Do not alter the value of this variable manually. This variable should be
10884 updated by the functions `define-coding-system' and
10885 `define-coding-system-alias'. */);
10886 Vcoding_system_list = Qnil;
10888 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10889 doc: /* Alist of coding system names.
10890 Each element is one element list of coding system name.
10891 This variable is given to `completing-read' as COLLECTION argument.
10893 Do not alter the value of this variable manually. This variable should be
10894 updated by the functions `make-coding-system' and
10895 `define-coding-system-alias'. */);
10896 Vcoding_system_alist = Qnil;
10898 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10899 doc: /* List of coding-categories (symbols) ordered by priority.
10901 On detecting a coding system, Emacs tries code detection algorithms
10902 associated with each coding-category one by one in this order. When
10903 one algorithm agrees with a byte sequence of source text, the coding
10904 system bound to the corresponding coding-category is selected.
10906 Don't modify this variable directly, but use `set-coding-system-priority'. */);
10908 int i;
10910 Vcoding_category_list = Qnil;
10911 for (i = coding_category_max - 1; i >= 0; i--)
10912 Vcoding_category_list
10913 = Fcons (AREF (Vcoding_category_table, i),
10914 Vcoding_category_list);
10917 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10918 doc: /* Specify the coding system for read operations.
10919 It is useful to bind this variable with `let', but do not set it globally.
10920 If the value is a coding system, it is used for decoding on read operation.
10921 If not, an appropriate element is used from one of the coding system alists.
10922 There are three such tables: `file-coding-system-alist',
10923 `process-coding-system-alist', and `network-coding-system-alist'. */);
10924 Vcoding_system_for_read = Qnil;
10926 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10927 doc: /* Specify the coding system for write operations.
10928 Programs bind this variable with `let', but you should not set it globally.
10929 If the value is a coding system, it is used for encoding of output,
10930 when writing it to a file and when sending it to a file or subprocess.
10932 If this does not specify a coding system, an appropriate element
10933 is used from one of the coding system alists.
10934 There are three such tables: `file-coding-system-alist',
10935 `process-coding-system-alist', and `network-coding-system-alist'.
10936 For output to files, if the above procedure does not specify a coding system,
10937 the value of `buffer-file-coding-system' is used. */);
10938 Vcoding_system_for_write = Qnil;
10940 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10941 doc: /*
10942 Coding system used in the latest file or process I/O. */);
10943 Vlast_coding_system_used = Qnil;
10945 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10946 doc: /*
10947 Error status of the last code conversion.
10949 When an error was detected in the last code conversion, this variable
10950 is set to one of the following symbols.
10951 `insufficient-source'
10952 `inconsistent-eol'
10953 `invalid-source'
10954 `interrupted'
10955 `insufficient-memory'
10956 When no error was detected, the value doesn't change. So, to check
10957 the error status of a code conversion by this variable, you must
10958 explicitly set this variable to nil before performing code
10959 conversion. */);
10960 Vlast_code_conversion_error = Qnil;
10962 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10963 doc: /*
10964 *Non-nil means always inhibit code conversion of end-of-line format.
10965 See info node `Coding Systems' and info node `Text and Binary' concerning
10966 such conversion. */);
10967 inhibit_eol_conversion = 0;
10969 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10970 doc: /*
10971 Non-nil means process buffer inherits coding system of process output.
10972 Bind it to t if the process output is to be treated as if it were a file
10973 read from some filesystem. */);
10974 inherit_process_coding_system = 0;
10976 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10977 doc: /*
10978 Alist to decide a coding system to use for a file I/O operation.
10979 The format is ((PATTERN . VAL) ...),
10980 where PATTERN is a regular expression matching a file name,
10981 VAL is a coding system, a cons of coding systems, or a function symbol.
10982 If VAL is a coding system, it is used for both decoding and encoding
10983 the file contents.
10984 If VAL is a cons of coding systems, the car part is used for decoding,
10985 and the cdr part is used for encoding.
10986 If VAL is a function symbol, the function must return a coding system
10987 or a cons of coding systems which are used as above. The function is
10988 called with an argument that is a list of the arguments with which
10989 `find-operation-coding-system' was called. If the function can't decide
10990 a coding system, it can return `undecided' so that the normal
10991 code-detection is performed.
10993 See also the function `find-operation-coding-system'
10994 and the variable `auto-coding-alist'. */);
10995 Vfile_coding_system_alist = Qnil;
10997 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10998 doc: /*
10999 Alist to decide a coding system to use for a process I/O operation.
11000 The format is ((PATTERN . VAL) ...),
11001 where PATTERN is a regular expression matching a program name,
11002 VAL is a coding system, a cons of coding systems, or a function symbol.
11003 If VAL is a coding system, it is used for both decoding what received
11004 from the program and encoding what sent to the program.
11005 If VAL is a cons of coding systems, the car part is used for decoding,
11006 and the cdr part is used for encoding.
11007 If VAL is a function symbol, the function must return a coding system
11008 or a cons of coding systems which are used as above.
11010 See also the function `find-operation-coding-system'. */);
11011 Vprocess_coding_system_alist = Qnil;
11013 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11014 doc: /*
11015 Alist to decide a coding system to use for a network I/O operation.
11016 The format is ((PATTERN . VAL) ...),
11017 where PATTERN is a regular expression matching a network service name
11018 or is a port number to connect to,
11019 VAL is a coding system, a cons of coding systems, or a function symbol.
11020 If VAL is a coding system, it is used for both decoding what received
11021 from the network stream and encoding what sent to the network stream.
11022 If VAL is a cons of coding systems, the car part is used for decoding,
11023 and the cdr part is used for encoding.
11024 If VAL is a function symbol, the function must return a coding system
11025 or a cons of coding systems which are used as above.
11027 See also the function `find-operation-coding-system'. */);
11028 Vnetwork_coding_system_alist = Qnil;
11030 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11031 doc: /* Coding system to use with system messages.
11032 Also used for decoding keyboard input on X Window system. */);
11033 Vlocale_coding_system = Qnil;
11035 /* The eol mnemonics are reset in startup.el system-dependently. */
11036 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11037 doc: /*
11038 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
11039 eol_mnemonic_unix = build_pure_c_string (":");
11041 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11042 doc: /*
11043 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
11044 eol_mnemonic_dos = build_pure_c_string ("\\");
11046 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11047 doc: /*
11048 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
11049 eol_mnemonic_mac = build_pure_c_string ("/");
11051 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11052 doc: /*
11053 *String displayed in mode line when end-of-line format is not yet determined. */);
11054 eol_mnemonic_undecided = build_pure_c_string (":");
11056 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11057 doc: /*
11058 *Non-nil enables character translation while encoding and decoding. */);
11059 Venable_character_translation = Qt;
11061 DEFVAR_LISP ("standard-translation-table-for-decode",
11062 Vstandard_translation_table_for_decode,
11063 doc: /* Table for translating characters while decoding. */);
11064 Vstandard_translation_table_for_decode = Qnil;
11066 DEFVAR_LISP ("standard-translation-table-for-encode",
11067 Vstandard_translation_table_for_encode,
11068 doc: /* Table for translating characters while encoding. */);
11069 Vstandard_translation_table_for_encode = Qnil;
11071 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11072 doc: /* Alist of charsets vs revision numbers.
11073 While encoding, if a charset (car part of an element) is found,
11074 designate it with the escape sequence identifying revision (cdr part
11075 of the element). */);
11076 Vcharset_revision_table = Qnil;
11078 DEFVAR_LISP ("default-process-coding-system",
11079 Vdefault_process_coding_system,
11080 doc: /* Cons of coding systems used for process I/O by default.
11081 The car part is used for decoding a process output,
11082 the cdr part is used for encoding a text to be sent to a process. */);
11083 Vdefault_process_coding_system = Qnil;
11085 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11086 doc: /*
11087 Table of extra Latin codes in the range 128..159 (inclusive).
11088 This is a vector of length 256.
11089 If Nth element is non-nil, the existence of code N in a file
11090 \(or output of subprocess) doesn't prevent it to be detected as
11091 a coding system of ISO 2022 variant which has a flag
11092 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11093 or reading output of a subprocess.
11094 Only 128th through 159th elements have a meaning. */);
11095 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11097 DEFVAR_LISP ("select-safe-coding-system-function",
11098 Vselect_safe_coding_system_function,
11099 doc: /*
11100 Function to call to select safe coding system for encoding a text.
11102 If set, this function is called to force a user to select a proper
11103 coding system which can encode the text in the case that a default
11104 coding system used in each operation can't encode the text. The
11105 function should take care that the buffer is not modified while
11106 the coding system is being selected.
11108 The default value is `select-safe-coding-system' (which see). */);
11109 Vselect_safe_coding_system_function = Qnil;
11111 DEFVAR_BOOL ("coding-system-require-warning",
11112 coding_system_require_warning,
11113 doc: /* Internal use only.
11114 If non-nil, on writing a file, `select-safe-coding-system-function' is
11115 called even if `coding-system-for-write' is non-nil. The command
11116 `universal-coding-system-argument' binds this variable to t temporarily. */);
11117 coding_system_require_warning = 0;
11120 DEFVAR_BOOL ("inhibit-iso-escape-detection",
11121 inhibit_iso_escape_detection,
11122 doc: /*
11123 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11125 When Emacs reads text, it tries to detect how the text is encoded.
11126 This code detection is sensitive to escape sequences. If Emacs sees
11127 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11128 of the ISO2022 encodings, and decodes text by the corresponding coding
11129 system (e.g. `iso-2022-7bit').
11131 However, there may be a case that you want to read escape sequences in
11132 a file as is. In such a case, you can set this variable to non-nil.
11133 Then the code detection will ignore any escape sequences, and no text is
11134 detected as encoded in some ISO-2022 encoding. The result is that all
11135 escape sequences become visible in a buffer.
11137 The default value is nil, and it is strongly recommended not to change
11138 it. That is because many Emacs Lisp source files that contain
11139 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11140 in Emacs's distribution, and they won't be decoded correctly on
11141 reading if you suppress escape sequence detection.
11143 The other way to read escape sequences in a file without decoding is
11144 to explicitly specify some coding system that doesn't use ISO-2022
11145 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument]. */);
11146 inhibit_iso_escape_detection = 0;
11148 DEFVAR_BOOL ("inhibit-null-byte-detection",
11149 inhibit_null_byte_detection,
11150 doc: /* If non-nil, Emacs ignores null bytes on code detection.
11151 By default, Emacs treats it as binary data, and does not attempt to
11152 decode it. The effect is as if you specified `no-conversion' for
11153 reading that text.
11155 Set this to non-nil when a regular text happens to include null bytes.
11156 Examples are Index nodes of Info files and null-byte delimited output
11157 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
11158 decode text as usual. */);
11159 inhibit_null_byte_detection = 0;
11161 DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11162 doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11163 Internal use only. Removed after the experimental optimizer gets stable. */);
11164 disable_ascii_optimization = 0;
11166 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11167 doc: /* Char table for translating self-inserting characters.
11168 This is applied to the result of input methods, not their input.
11169 See also `keyboard-translate-table'.
11171 Use of this variable for character code unification was rendered
11172 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11173 internal character representation. */);
11174 Vtranslation_table_for_input = Qnil;
11177 Lisp_Object args[coding_arg_undecided_max];
11178 Lisp_Object plist[16];
11179 int i;
11181 for (i = 0; i < coding_arg_undecided_max; i++)
11182 args[i] = Qnil;
11184 plist[0] = intern_c_string (":name");
11185 plist[1] = args[coding_arg_name] = Qno_conversion;
11186 plist[2] = intern_c_string (":mnemonic");
11187 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11188 plist[4] = intern_c_string (":coding-type");
11189 plist[5] = args[coding_arg_coding_type] = Qraw_text;
11190 plist[6] = intern_c_string (":ascii-compatible-p");
11191 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11192 plist[8] = intern_c_string (":default-char");
11193 plist[9] = args[coding_arg_default_char] = make_number (0);
11194 plist[10] = intern_c_string (":for-unibyte");
11195 plist[11] = args[coding_arg_for_unibyte] = Qt;
11196 plist[12] = intern_c_string (":docstring");
11197 plist[13] = build_pure_c_string ("Do no conversion.\n\
11199 When you visit a file with this coding, the file is read into a\n\
11200 unibyte buffer as is, thus each byte of a file is treated as a\n\
11201 character.");
11202 plist[14] = intern_c_string (":eol-type");
11203 plist[15] = args[coding_arg_eol_type] = Qunix;
11204 args[coding_arg_plist] = Flist (16, plist);
11205 Fdefine_coding_system_internal (coding_arg_max, args);
11207 plist[1] = args[coding_arg_name] = Qundecided;
11208 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11209 plist[5] = args[coding_arg_coding_type] = Qundecided;
11210 /* This is already set.
11211 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11212 plist[8] = intern_c_string (":charset-list");
11213 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11214 plist[11] = args[coding_arg_for_unibyte] = Qnil;
11215 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11216 plist[15] = args[coding_arg_eol_type] = Qnil;
11217 args[coding_arg_plist] = Flist (16, plist);
11218 args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11219 args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11220 Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11223 setup_coding_system (Qno_conversion, &safe_terminal_coding);
11226 int i;
11228 for (i = 0; i < coding_category_max; i++)
11229 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11231 #if defined (DOS_NT)
11232 system_eol_type = Qdos;
11233 #else
11234 system_eol_type = Qunix;
11235 #endif
11236 staticpro (&system_eol_type);
11239 char *
11240 emacs_strerror (int error_number)
11242 char *str;
11244 synchronize_system_messages_locale ();
11245 str = strerror (error_number);
11247 if (! NILP (Vlocale_coding_system))
11249 Lisp_Object dec = code_convert_string_norecord (build_string (str),
11250 Vlocale_coding_system,
11252 str = SSDATA (dec);
11255 return str;
11258 #endif /* emacs */