Added install instructions using MSYS.
[emacs.git] / src / coding.c
blobf9799035b3c293b9c02b4106f81373fe25bc4509
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2013 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26 /*** TABLE OF CONTENTS ***
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
44 /*** 0. General comments ***
47 CODING SYSTEM
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
67 o UTF-8
69 o UTF-16
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
94 o BIG5
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
102 o CCL
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
109 o Raw-text
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
115 o No-conversion
117 Like raw text, but don't do end-of-line conversion.
120 END-OF-LINE FORMAT
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
132 STRUCT CODING_SYSTEM
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
141 /* COMMON MACROS */
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
150 Return true if the byte sequence conforms to XXX.
152 Below is the template of these functions. */
154 #if 0
155 static bool
156 detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
161 bool multibytep = coding->src_multibyte;
162 ptrdiff_t consumed_chars = 0;
163 int found = 0;
164 ...;
166 while (1)
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (! __C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX;
177 /* The byte sequence is invalid for XXX. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0;
181 no_more_source:
182 /* The source exhausted successfully. */
183 detect_info->found |= found;
184 return 1;
186 #endif
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
200 Below is the template of these functions. */
202 #if 0
203 static void
204 decode_coding_XXXX (struct coding_system *coding)
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base;
212 /* A buffer to produce decoded characters. */
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
215 bool multibytep = coding->src_multibyte;
217 while (1)
219 src_base = src;
220 if (charbuf < charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
240 #endif
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
258 Below is a template of these functions. */
259 #if 0
260 static void
261 encode_coding_XXX (struct coding_system *coding)
263 bool multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
269 ptrdiff_t produced_chars = 0;
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. */
276 label_no_more_destination:
277 /* How many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
281 #endif
284 /*** 1. Preamble ***/
286 #include <config.h>
287 #include <stdio.h>
289 #ifdef HAVE_WCHAR_H
290 #include <wchar.h>
291 #endif /* HAVE_WCHAR_H */
293 #include "lisp.h"
294 #include "character.h"
295 #include "buffer.h"
296 #include "charset.h"
297 #include "ccl.h"
298 #include "composite.h"
299 #include "coding.h"
300 #include "window.h"
301 #include "frame.h"
302 #include "termhooks.h"
304 Lisp_Object Vcoding_system_hash_table;
306 static Lisp_Object Qcoding_system, Qeol_type;
307 static Lisp_Object Qcoding_aliases;
308 Lisp_Object Qunix, Qdos;
309 static Lisp_Object Qmac;
310 Lisp_Object Qbuffer_file_coding_system;
311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
312 static Lisp_Object Qdefault_char;
313 Lisp_Object Qno_conversion, Qundecided;
314 Lisp_Object Qcharset, Qutf_8;
315 static Lisp_Object Qiso_2022;
316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
317 static Lisp_Object Qbig, Qlittle;
318 static Lisp_Object Qcoding_system_history;
319 static Lisp_Object Qvalid_codes;
320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
323 static Lisp_Object QCascii_compatible_p;
325 Lisp_Object Qcall_process, Qcall_process_region;
326 Lisp_Object Qstart_process, Qopen_network_stream;
327 static Lisp_Object Qtarget_idx;
329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
331 /* If a symbol has this property, evaluate the value to define the
332 symbol as a coding system. */
333 static Lisp_Object Qcoding_system_define_form;
335 /* Format of end-of-line decided by system. This is Qunix on
336 Unix and Mac, Qdos on DOS/Windows.
337 This has an effect only for external encoding (i.e. for output to
338 file and process), not for in-buffer or Lisp string encoding. */
339 static Lisp_Object system_eol_type;
341 #ifdef emacs
343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
345 /* Coding system emacs-mule and raw-text are for converting only
346 end-of-line format. */
347 Lisp_Object Qemacs_mule, Qraw_text;
348 Lisp_Object Qutf_8_emacs;
350 #if defined (WINDOWSNT) || defined (CYGWIN)
351 static Lisp_Object Qutf_16le;
352 #endif
354 /* Coding-systems are handed between Emacs Lisp programs and C internal
355 routines by the following three variables. */
356 /* Coding system to be used to encode text for terminal display when
357 terminal coding system is nil. */
358 struct coding_system safe_terminal_coding;
360 #endif /* emacs */
362 Lisp_Object Qtranslation_table;
363 Lisp_Object Qtranslation_table_id;
364 static Lisp_Object Qtranslation_table_for_decode;
365 static Lisp_Object Qtranslation_table_for_encode;
367 /* Two special coding systems. */
368 static Lisp_Object Vsjis_coding_system;
369 static Lisp_Object Vbig5_coding_system;
371 /* ISO2022 section */
373 #define CODING_ISO_INITIAL(coding, reg) \
374 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
375 coding_attr_iso_initial), \
376 reg)))
379 #define CODING_ISO_REQUEST(coding, charset_id) \
380 (((charset_id) <= (coding)->max_charset_id \
381 ? ((coding)->safe_charsets[charset_id] != 255 \
382 ? (coding)->safe_charsets[charset_id] \
383 : -1) \
384 : -1))
387 #define CODING_ISO_FLAGS(coding) \
388 ((coding)->spec.iso_2022.flags)
389 #define CODING_ISO_DESIGNATION(coding, reg) \
390 ((coding)->spec.iso_2022.current_designation[reg])
391 #define CODING_ISO_INVOCATION(coding, plane) \
392 ((coding)->spec.iso_2022.current_invocation[plane])
393 #define CODING_ISO_SINGLE_SHIFTING(coding) \
394 ((coding)->spec.iso_2022.single_shifting)
395 #define CODING_ISO_BOL(coding) \
396 ((coding)->spec.iso_2022.bol)
397 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
398 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
399 #define CODING_ISO_CMP_STATUS(coding) \
400 (&(coding)->spec.iso_2022.cmp_status)
401 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
402 ((coding)->spec.iso_2022.ctext_extended_segment_len)
403 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
404 ((coding)->spec.iso_2022.embedded_utf_8)
406 /* Control characters of ISO2022. */
407 /* code */ /* function */
408 #define ISO_CODE_SO 0x0E /* shift-out */
409 #define ISO_CODE_SI 0x0F /* shift-in */
410 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
411 #define ISO_CODE_ESC 0x1B /* escape */
412 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
413 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
414 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
416 /* All code (1-byte) of ISO2022 is classified into one of the
417 followings. */
418 enum iso_code_class_type
420 ISO_control_0, /* Control codes in the range
421 0x00..0x1F and 0x7F, except for the
422 following 5 codes. */
423 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
424 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
425 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
426 ISO_escape, /* ISO_CODE_ESC (0x1B) */
427 ISO_control_1, /* Control codes in the range
428 0x80..0x9F, except for the
429 following 3 codes. */
430 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
431 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
432 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
433 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
434 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
435 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
436 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
440 `iso-flags' attribute of an iso2022 coding system. */
442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
443 instead of the correct short-form sequence (e.g. ESC $ A). */
444 #define CODING_ISO_FLAG_LONG_FORM 0x0001
446 /* If set, reset graphic planes and registers at end-of-line to the
447 initial state. */
448 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
450 /* If set, reset graphic planes and registers before any control
451 characters to the initial state. */
452 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
454 /* If set, encode by 7-bit environment. */
455 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
457 /* If set, use locking-shift function. */
458 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
460 /* If set, use single-shift function. Overwrite
461 CODING_ISO_FLAG_LOCKING_SHIFT. */
462 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
464 /* If set, use designation escape sequence. */
465 #define CODING_ISO_FLAG_DESIGNATION 0x0040
467 /* If set, produce revision number sequence. */
468 #define CODING_ISO_FLAG_REVISION 0x0080
470 /* If set, produce ISO6429's direction specifying sequence. */
471 #define CODING_ISO_FLAG_DIRECTION 0x0100
473 /* If set, assume designation states are reset at beginning of line on
474 output. */
475 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
477 /* If set, designation sequence should be placed at beginning of line
478 on output. */
479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
481 /* If set, do not encode unsafe characters on output. */
482 #define CODING_ISO_FLAG_SAFE 0x0800
484 /* If set, extra latin codes (128..159) are accepted as a valid code
485 on input. */
486 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
488 #define CODING_ISO_FLAG_COMPOSITION 0x2000
490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
492 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
494 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
496 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
498 /* A character to be produced on output if encoding of the original
499 character is prohibited by CODING_ISO_FLAG_SAFE. */
500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
502 /* UTF-8 section */
503 #define CODING_UTF_8_BOM(coding) \
504 ((coding)->spec.utf_8_bom)
506 /* UTF-16 section */
507 #define CODING_UTF_16_BOM(coding) \
508 ((coding)->spec.utf_16.bom)
510 #define CODING_UTF_16_ENDIAN(coding) \
511 ((coding)->spec.utf_16.endian)
513 #define CODING_UTF_16_SURROGATE(coding) \
514 ((coding)->spec.utf_16.surrogate)
517 /* CCL section */
518 #define CODING_CCL_DECODER(coding) \
519 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
520 #define CODING_CCL_ENCODER(coding) \
521 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
522 #define CODING_CCL_VALIDS(coding) \
523 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
525 /* Index for each coding category in `coding_categories' */
527 enum coding_category
529 coding_category_iso_7,
530 coding_category_iso_7_tight,
531 coding_category_iso_8_1,
532 coding_category_iso_8_2,
533 coding_category_iso_7_else,
534 coding_category_iso_8_else,
535 coding_category_utf_8_auto,
536 coding_category_utf_8_nosig,
537 coding_category_utf_8_sig,
538 coding_category_utf_16_auto,
539 coding_category_utf_16_be,
540 coding_category_utf_16_le,
541 coding_category_utf_16_be_nosig,
542 coding_category_utf_16_le_nosig,
543 coding_category_charset,
544 coding_category_sjis,
545 coding_category_big5,
546 coding_category_ccl,
547 coding_category_emacs_mule,
548 /* All above are targets of code detection. */
549 coding_category_raw_text,
550 coding_category_undecided,
551 coding_category_max
554 /* Definitions of flag bits used in detect_coding_XXXX. */
555 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
556 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
557 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
558 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
559 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
560 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
561 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
562 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
563 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
564 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
565 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
566 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
567 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
568 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
569 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
570 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
571 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
572 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
573 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
574 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
576 /* This value is returned if detect_coding_mask () find nothing other
577 than ASCII characters. */
578 #define CATEGORY_MASK_ANY \
579 (CATEGORY_MASK_ISO_7 \
580 | CATEGORY_MASK_ISO_7_TIGHT \
581 | CATEGORY_MASK_ISO_8_1 \
582 | CATEGORY_MASK_ISO_8_2 \
583 | CATEGORY_MASK_ISO_7_ELSE \
584 | CATEGORY_MASK_ISO_8_ELSE \
585 | CATEGORY_MASK_UTF_8_AUTO \
586 | CATEGORY_MASK_UTF_8_NOSIG \
587 | CATEGORY_MASK_UTF_8_SIG \
588 | CATEGORY_MASK_UTF_16_AUTO \
589 | CATEGORY_MASK_UTF_16_BE \
590 | CATEGORY_MASK_UTF_16_LE \
591 | CATEGORY_MASK_UTF_16_BE_NOSIG \
592 | CATEGORY_MASK_UTF_16_LE_NOSIG \
593 | CATEGORY_MASK_CHARSET \
594 | CATEGORY_MASK_SJIS \
595 | CATEGORY_MASK_BIG5 \
596 | CATEGORY_MASK_CCL \
597 | CATEGORY_MASK_EMACS_MULE)
600 #define CATEGORY_MASK_ISO_7BIT \
601 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
603 #define CATEGORY_MASK_ISO_8BIT \
604 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
606 #define CATEGORY_MASK_ISO_ELSE \
607 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
609 #define CATEGORY_MASK_ISO_ESCAPE \
610 (CATEGORY_MASK_ISO_7 \
611 | CATEGORY_MASK_ISO_7_TIGHT \
612 | CATEGORY_MASK_ISO_7_ELSE \
613 | CATEGORY_MASK_ISO_8_ELSE)
615 #define CATEGORY_MASK_ISO \
616 ( CATEGORY_MASK_ISO_7BIT \
617 | CATEGORY_MASK_ISO_8BIT \
618 | CATEGORY_MASK_ISO_ELSE)
620 #define CATEGORY_MASK_UTF_16 \
621 (CATEGORY_MASK_UTF_16_AUTO \
622 | CATEGORY_MASK_UTF_16_BE \
623 | CATEGORY_MASK_UTF_16_LE \
624 | CATEGORY_MASK_UTF_16_BE_NOSIG \
625 | CATEGORY_MASK_UTF_16_LE_NOSIG)
627 #define CATEGORY_MASK_UTF_8 \
628 (CATEGORY_MASK_UTF_8_AUTO \
629 | CATEGORY_MASK_UTF_8_NOSIG \
630 | CATEGORY_MASK_UTF_8_SIG)
632 /* Table of coding categories (Lisp symbols). This variable is for
633 internal use only. */
634 static Lisp_Object Vcoding_category_table;
636 /* Table of coding-categories ordered by priority. */
637 static enum coding_category coding_priorities[coding_category_max];
639 /* Nth element is a coding context for the coding system bound to the
640 Nth coding category. */
641 static struct coding_system coding_categories[coding_category_max];
643 /*** Commonly used macros and functions ***/
645 #ifndef min
646 #define min(a, b) ((a) < (b) ? (a) : (b))
647 #endif
648 #ifndef max
649 #define max(a, b) ((a) > (b) ? (a) : (b))
650 #endif
652 #define CODING_GET_INFO(coding, attrs, charset_list) \
653 do { \
654 (attrs) = CODING_ID_ATTRS ((coding)->id); \
655 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
656 } while (0)
659 /* Safely get one byte from the source text pointed by SRC which ends
660 at SRC_END, and set C to that byte. If there are not enough bytes
661 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
662 and a multibyte character is found at SRC, set C to the
663 negative value of the character code. The caller should declare
664 and set these variables appropriately in advance:
665 src, src_end, multibytep */
667 #define ONE_MORE_BYTE(c) \
668 do { \
669 if (src == src_end) \
671 if (src_base < src) \
672 record_conversion_result \
673 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
674 goto no_more_source; \
676 c = *src++; \
677 if (multibytep && (c & 0x80)) \
679 if ((c & 0xFE) == 0xC0) \
680 c = ((c & 1) << 6) | *src++; \
681 else \
683 src--; \
684 c = - string_char (src, &src, NULL); \
685 record_conversion_result \
686 (coding, CODING_RESULT_INVALID_SRC); \
689 consumed_chars++; \
690 } while (0)
692 /* Safely get two bytes from the source text pointed by SRC which ends
693 at SRC_END, and set C1 and C2 to those bytes while skipping the
694 heading multibyte characters. If there are not enough bytes in the
695 source, it jumps to 'no_more_source'. If MULTIBYTEP and
696 a multibyte character is found for C2, set C2 to the negative value
697 of the character code. The caller should declare and set these
698 variables appropriately in advance:
699 src, src_end, multibytep
700 It is intended that this macro is used in detect_coding_utf_16. */
702 #define TWO_MORE_BYTES(c1, c2) \
703 do { \
704 do { \
705 if (src == src_end) \
706 goto no_more_source; \
707 c1 = *src++; \
708 if (multibytep && (c1 & 0x80)) \
710 if ((c1 & 0xFE) == 0xC0) \
711 c1 = ((c1 & 1) << 6) | *src++; \
712 else \
714 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
715 c1 = -1; \
718 } while (c1 < 0); \
719 if (src == src_end) \
720 goto no_more_source; \
721 c2 = *src++; \
722 if (multibytep && (c2 & 0x80)) \
724 if ((c2 & 0xFE) == 0xC0) \
725 c2 = ((c2 & 1) << 6) | *src++; \
726 else \
727 c2 = -1; \
729 } while (0)
732 /* Store a byte C in the place pointed by DST and increment DST to the
733 next free point, and increment PRODUCED_CHARS. The caller should
734 assure that C is 0..127, and declare and set the variable `dst'
735 appropriately in advance.
739 #define EMIT_ONE_ASCII_BYTE(c) \
740 do { \
741 produced_chars++; \
742 *dst++ = (c); \
743 } while (0)
746 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
748 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
749 do { \
750 produced_chars += 2; \
751 *dst++ = (c1), *dst++ = (c2); \
752 } while (0)
755 /* Store a byte C in the place pointed by DST and increment DST to the
756 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
757 store in an appropriate multibyte form. The caller should
758 declare and set the variables `dst' and `multibytep' appropriately
759 in advance. */
761 #define EMIT_ONE_BYTE(c) \
762 do { \
763 produced_chars++; \
764 if (multibytep) \
766 unsigned ch = (c); \
767 if (ch >= 0x80) \
768 ch = BYTE8_TO_CHAR (ch); \
769 CHAR_STRING_ADVANCE (ch, dst); \
771 else \
772 *dst++ = (c); \
773 } while (0)
776 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
778 #define EMIT_TWO_BYTES(c1, c2) \
779 do { \
780 produced_chars += 2; \
781 if (multibytep) \
783 unsigned ch; \
785 ch = (c1); \
786 if (ch >= 0x80) \
787 ch = BYTE8_TO_CHAR (ch); \
788 CHAR_STRING_ADVANCE (ch, dst); \
789 ch = (c2); \
790 if (ch >= 0x80) \
791 ch = BYTE8_TO_CHAR (ch); \
792 CHAR_STRING_ADVANCE (ch, dst); \
794 else \
796 *dst++ = (c1); \
797 *dst++ = (c2); \
799 } while (0)
802 #define EMIT_THREE_BYTES(c1, c2, c3) \
803 do { \
804 EMIT_ONE_BYTE (c1); \
805 EMIT_TWO_BYTES (c2, c3); \
806 } while (0)
809 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
810 do { \
811 EMIT_TWO_BYTES (c1, c2); \
812 EMIT_TWO_BYTES (c3, c4); \
813 } while (0)
816 static void
817 record_conversion_result (struct coding_system *coding,
818 enum coding_result_code result)
820 coding->result = result;
821 switch (result)
823 case CODING_RESULT_INSUFFICIENT_SRC:
824 Vlast_code_conversion_error = Qinsufficient_source;
825 break;
826 case CODING_RESULT_INVALID_SRC:
827 Vlast_code_conversion_error = Qinvalid_source;
828 break;
829 case CODING_RESULT_INTERRUPT:
830 Vlast_code_conversion_error = Qinterrupted;
831 break;
832 case CODING_RESULT_INSUFFICIENT_DST:
833 /* Don't record this error in Vlast_code_conversion_error
834 because it happens just temporarily and is resolved when the
835 whole conversion is finished. */
836 break;
837 case CODING_RESULT_SUCCESS:
838 break;
839 default:
840 Vlast_code_conversion_error = intern ("Unknown error");
844 /* These wrapper macros are used to preserve validity of pointers into
845 buffer text across calls to decode_char, encode_char, etc, which
846 could cause relocation of buffers if it loads a charset map,
847 because loading a charset map allocates large structures. */
849 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
850 do { \
851 ptrdiff_t offset; \
853 charset_map_loaded = 0; \
854 c = DECODE_CHAR (charset, code); \
855 if (charset_map_loaded \
856 && (offset = coding_change_source (coding))) \
858 src += offset; \
859 src_base += offset; \
860 src_end += offset; \
862 } while (0)
864 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
865 do { \
866 ptrdiff_t offset; \
868 charset_map_loaded = 0; \
869 code = ENCODE_CHAR (charset, c); \
870 if (charset_map_loaded \
871 && (offset = coding_change_destination (coding))) \
873 dst += offset; \
874 dst_end += offset; \
876 } while (0)
878 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
879 do { \
880 ptrdiff_t offset; \
882 charset_map_loaded = 0; \
883 charset = char_charset (c, charset_list, code_return); \
884 if (charset_map_loaded \
885 && (offset = coding_change_destination (coding))) \
887 dst += offset; \
888 dst_end += offset; \
890 } while (0)
892 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
893 do { \
894 ptrdiff_t offset; \
896 charset_map_loaded = 0; \
897 result = CHAR_CHARSET_P (c, charset); \
898 if (charset_map_loaded \
899 && (offset = coding_change_destination (coding))) \
901 dst += offset; \
902 dst_end += offset; \
904 } while (0)
907 /* If there are at least BYTES length of room at dst, allocate memory
908 for coding->destination and update dst and dst_end. We don't have
909 to take care of coding->source which will be relocated. It is
910 handled by calling coding_set_source in encode_coding. */
912 #define ASSURE_DESTINATION(bytes) \
913 do { \
914 if (dst + (bytes) >= dst_end) \
916 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
918 dst = alloc_destination (coding, more_bytes, dst); \
919 dst_end = coding->destination + coding->dst_bytes; \
921 } while (0)
924 /* Store multibyte form of the character C in P, and advance P to the
925 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
926 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
927 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
929 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
931 /* Return the character code of character whose multibyte form is at
932 P, and advance P to the end of the multibyte form. This used to be
933 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
934 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
936 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
938 /* Set coding->source from coding->src_object. */
940 static void
941 coding_set_source (struct coding_system *coding)
943 if (BUFFERP (coding->src_object))
945 struct buffer *buf = XBUFFER (coding->src_object);
947 if (coding->src_pos < 0)
948 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
949 else
950 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
952 else if (STRINGP (coding->src_object))
954 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
956 else
958 /* Otherwise, the source is C string and is never relocated
959 automatically. Thus we don't have to update anything. */
964 /* Set coding->source from coding->src_object, and return how many
965 bytes coding->source was changed. */
967 static ptrdiff_t
968 coding_change_source (struct coding_system *coding)
970 const unsigned char *orig = coding->source;
971 coding_set_source (coding);
972 return coding->source - orig;
976 /* Set coding->destination from coding->dst_object. */
978 static void
979 coding_set_destination (struct coding_system *coding)
981 if (BUFFERP (coding->dst_object))
983 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
985 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
986 coding->dst_bytes = (GAP_END_ADDR
987 - (coding->src_bytes - coding->consumed)
988 - coding->destination);
990 else
992 /* We are sure that coding->dst_pos_byte is before the gap
993 of the buffer. */
994 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
995 + coding->dst_pos_byte - BEG_BYTE);
996 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
997 - coding->destination);
1000 else
1002 /* Otherwise, the destination is C string and is never relocated
1003 automatically. Thus we don't have to update anything. */
1008 /* Set coding->destination from coding->dst_object, and return how
1009 many bytes coding->destination was changed. */
1011 static ptrdiff_t
1012 coding_change_destination (struct coding_system *coding)
1014 const unsigned char *orig = coding->destination;
1015 coding_set_destination (coding);
1016 return coding->destination - orig;
1020 static void
1021 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1023 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1024 string_overflow ();
1025 coding->destination = xrealloc (coding->destination,
1026 coding->dst_bytes + bytes);
1027 coding->dst_bytes += bytes;
1030 static void
1031 coding_alloc_by_making_gap (struct coding_system *coding,
1032 ptrdiff_t gap_head_used, ptrdiff_t bytes)
1034 if (EQ (coding->src_object, coding->dst_object))
1036 /* The gap may contain the produced data at the head and not-yet
1037 consumed data at the tail. To preserve those data, we at
1038 first make the gap size to zero, then increase the gap
1039 size. */
1040 ptrdiff_t add = GAP_SIZE;
1042 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1043 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1044 make_gap (bytes);
1045 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1046 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1048 else
1049 make_gap_1 (XBUFFER (coding->dst_object), bytes);
1053 static unsigned char *
1054 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1055 unsigned char *dst)
1057 ptrdiff_t offset = dst - coding->destination;
1059 if (BUFFERP (coding->dst_object))
1061 struct buffer *buf = XBUFFER (coding->dst_object);
1063 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1065 else
1066 coding_alloc_by_realloc (coding, nbytes);
1067 coding_set_destination (coding);
1068 dst = coding->destination + offset;
1069 return dst;
1072 /** Macros for annotations. */
1074 /* An annotation data is stored in the array coding->charbuf in this
1075 format:
1076 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1077 LENGTH is the number of elements in the annotation.
1078 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1079 NCHARS is the number of characters in the text annotated.
1081 The format of the following elements depend on ANNOTATION_MASK.
1083 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1084 follows:
1085 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1087 NBYTES is the number of bytes specified in the header part of
1088 old-style emacs-mule encoding, or 0 for the other kind of
1089 composition.
1091 METHOD is one of enum composition_method.
1093 Optional COMPOSITION-COMPONENTS are characters and composition
1094 rules.
1096 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1097 follows.
1099 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1100 recover from an invalid annotation, and should be skipped by
1101 produce_annotation. */
1103 /* Maximum length of the header of annotation data. */
1104 #define MAX_ANNOTATION_LENGTH 5
1106 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1107 do { \
1108 *(buf)++ = -(len); \
1109 *(buf)++ = (mask); \
1110 *(buf)++ = (nchars); \
1111 coding->annotated = 1; \
1112 } while (0);
1114 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1115 do { \
1116 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1117 *buf++ = nbytes; \
1118 *buf++ = method; \
1119 } while (0)
1122 #define ADD_CHARSET_DATA(buf, nchars, id) \
1123 do { \
1124 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1125 *buf++ = id; \
1126 } while (0)
1129 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1134 /*** 3. UTF-8 ***/
1136 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1137 Return true if a text is encoded in UTF-8. */
1139 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1140 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1141 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1142 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1143 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1144 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1146 #define UTF_8_BOM_1 0xEF
1147 #define UTF_8_BOM_2 0xBB
1148 #define UTF_8_BOM_3 0xBF
1150 static bool
1151 detect_coding_utf_8 (struct coding_system *coding,
1152 struct coding_detection_info *detect_info)
1154 const unsigned char *src = coding->source, *src_base;
1155 const unsigned char *src_end = coding->source + coding->src_bytes;
1156 bool multibytep = coding->src_multibyte;
1157 ptrdiff_t consumed_chars = 0;
1158 bool bom_found = 0;
1159 bool found = 0;
1161 detect_info->checked |= CATEGORY_MASK_UTF_8;
1162 /* A coding system of this category is always ASCII compatible. */
1163 src += coding->head_ascii;
1165 while (1)
1167 int c, c1, c2, c3, c4;
1169 src_base = src;
1170 ONE_MORE_BYTE (c);
1171 if (c < 0 || UTF_8_1_OCTET_P (c))
1172 continue;
1173 ONE_MORE_BYTE (c1);
1174 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1175 break;
1176 if (UTF_8_2_OCTET_LEADING_P (c))
1178 found = 1;
1179 continue;
1181 ONE_MORE_BYTE (c2);
1182 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1183 break;
1184 if (UTF_8_3_OCTET_LEADING_P (c))
1186 found = 1;
1187 if (src_base == coding->source
1188 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1189 bom_found = 1;
1190 continue;
1192 ONE_MORE_BYTE (c3);
1193 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1194 break;
1195 if (UTF_8_4_OCTET_LEADING_P (c))
1197 found = 1;
1198 continue;
1200 ONE_MORE_BYTE (c4);
1201 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1202 break;
1203 if (UTF_8_5_OCTET_LEADING_P (c))
1205 found = 1;
1206 continue;
1208 break;
1210 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1211 return 0;
1213 no_more_source:
1214 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1216 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1217 return 0;
1219 if (bom_found)
1221 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1222 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1224 else
1226 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1227 if (found)
1228 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1230 return 1;
1234 static void
1235 decode_coding_utf_8 (struct coding_system *coding)
1237 const unsigned char *src = coding->source + coding->consumed;
1238 const unsigned char *src_end = coding->source + coding->src_bytes;
1239 const unsigned char *src_base;
1240 int *charbuf = coding->charbuf + coding->charbuf_used;
1241 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1242 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1243 bool multibytep = coding->src_multibyte;
1244 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1245 bool eol_dos
1246 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1247 int byte_after_cr = -1;
1249 if (bom != utf_without_bom)
1251 int c1, c2, c3;
1253 src_base = src;
1254 ONE_MORE_BYTE (c1);
1255 if (! UTF_8_3_OCTET_LEADING_P (c1))
1256 src = src_base;
1257 else
1259 ONE_MORE_BYTE (c2);
1260 if (! UTF_8_EXTRA_OCTET_P (c2))
1261 src = src_base;
1262 else
1264 ONE_MORE_BYTE (c3);
1265 if (! UTF_8_EXTRA_OCTET_P (c3))
1266 src = src_base;
1267 else
1269 if ((c1 != UTF_8_BOM_1)
1270 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1271 src = src_base;
1272 else
1273 CODING_UTF_8_BOM (coding) = utf_without_bom;
1278 CODING_UTF_8_BOM (coding) = utf_without_bom;
1280 while (1)
1282 int c, c1, c2, c3, c4, c5;
1284 src_base = src;
1285 consumed_chars_base = consumed_chars;
1287 if (charbuf >= charbuf_end)
1289 if (byte_after_cr >= 0)
1290 src_base--;
1291 break;
1294 if (byte_after_cr >= 0)
1295 c1 = byte_after_cr, byte_after_cr = -1;
1296 else
1297 ONE_MORE_BYTE (c1);
1298 if (c1 < 0)
1300 c = - c1;
1302 else if (UTF_8_1_OCTET_P (c1))
1304 if (eol_dos && c1 == '\r')
1305 ONE_MORE_BYTE (byte_after_cr);
1306 c = c1;
1308 else
1310 ONE_MORE_BYTE (c2);
1311 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1312 goto invalid_code;
1313 if (UTF_8_2_OCTET_LEADING_P (c1))
1315 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1316 /* Reject overlong sequences here and below. Encoders
1317 producing them are incorrect, they can be misleading,
1318 and they mess up read/write invariance. */
1319 if (c < 128)
1320 goto invalid_code;
1322 else
1324 ONE_MORE_BYTE (c3);
1325 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1326 goto invalid_code;
1327 if (UTF_8_3_OCTET_LEADING_P (c1))
1329 c = (((c1 & 0xF) << 12)
1330 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1331 if (c < 0x800
1332 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1333 goto invalid_code;
1335 else
1337 ONE_MORE_BYTE (c4);
1338 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1339 goto invalid_code;
1340 if (UTF_8_4_OCTET_LEADING_P (c1))
1342 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1343 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1344 if (c < 0x10000)
1345 goto invalid_code;
1347 else
1349 ONE_MORE_BYTE (c5);
1350 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1351 goto invalid_code;
1352 if (UTF_8_5_OCTET_LEADING_P (c1))
1354 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1355 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1356 | (c5 & 0x3F));
1357 if ((c > MAX_CHAR) || (c < 0x200000))
1358 goto invalid_code;
1360 else
1361 goto invalid_code;
1367 *charbuf++ = c;
1368 continue;
1370 invalid_code:
1371 src = src_base;
1372 consumed_chars = consumed_chars_base;
1373 ONE_MORE_BYTE (c);
1374 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1375 coding->errors++;
1378 no_more_source:
1379 coding->consumed_char += consumed_chars_base;
1380 coding->consumed = src_base - coding->source;
1381 coding->charbuf_used = charbuf - coding->charbuf;
1385 static bool
1386 encode_coding_utf_8 (struct coding_system *coding)
1388 bool multibytep = coding->dst_multibyte;
1389 int *charbuf = coding->charbuf;
1390 int *charbuf_end = charbuf + coding->charbuf_used;
1391 unsigned char *dst = coding->destination + coding->produced;
1392 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1393 ptrdiff_t produced_chars = 0;
1394 int c;
1396 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1398 ASSURE_DESTINATION (3);
1399 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1400 CODING_UTF_8_BOM (coding) = utf_without_bom;
1403 if (multibytep)
1405 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1407 while (charbuf < charbuf_end)
1409 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1411 ASSURE_DESTINATION (safe_room);
1412 c = *charbuf++;
1413 if (CHAR_BYTE8_P (c))
1415 c = CHAR_TO_BYTE8 (c);
1416 EMIT_ONE_BYTE (c);
1418 else
1420 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1421 for (p = str; p < pend; p++)
1422 EMIT_ONE_BYTE (*p);
1426 else
1428 int safe_room = MAX_MULTIBYTE_LENGTH;
1430 while (charbuf < charbuf_end)
1432 ASSURE_DESTINATION (safe_room);
1433 c = *charbuf++;
1434 if (CHAR_BYTE8_P (c))
1435 *dst++ = CHAR_TO_BYTE8 (c);
1436 else
1437 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1438 produced_chars++;
1441 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1442 coding->produced_char += produced_chars;
1443 coding->produced = dst - coding->destination;
1444 return 0;
1448 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1449 Return true if a text is encoded in one of UTF-16 based coding systems. */
1451 #define UTF_16_HIGH_SURROGATE_P(val) \
1452 (((val) & 0xFC00) == 0xD800)
1454 #define UTF_16_LOW_SURROGATE_P(val) \
1455 (((val) & 0xFC00) == 0xDC00)
1458 static bool
1459 detect_coding_utf_16 (struct coding_system *coding,
1460 struct coding_detection_info *detect_info)
1462 const unsigned char *src = coding->source;
1463 const unsigned char *src_end = coding->source + coding->src_bytes;
1464 bool multibytep = coding->src_multibyte;
1465 int c1, c2;
1467 detect_info->checked |= CATEGORY_MASK_UTF_16;
1468 if (coding->mode & CODING_MODE_LAST_BLOCK
1469 && (coding->src_chars & 1))
1471 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1472 return 0;
1475 TWO_MORE_BYTES (c1, c2);
1476 if ((c1 == 0xFF) && (c2 == 0xFE))
1478 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1479 | CATEGORY_MASK_UTF_16_AUTO);
1480 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1481 | CATEGORY_MASK_UTF_16_BE_NOSIG
1482 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1484 else if ((c1 == 0xFE) && (c2 == 0xFF))
1486 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1487 | CATEGORY_MASK_UTF_16_AUTO);
1488 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1489 | CATEGORY_MASK_UTF_16_BE_NOSIG
1490 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1492 else if (c2 < 0)
1494 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1495 return 0;
1497 else
1499 /* We check the dispersion of Eth and Oth bytes where E is even and
1500 O is odd. If both are high, we assume binary data.*/
1501 unsigned char e[256], o[256];
1502 unsigned e_num = 1, o_num = 1;
1504 memset (e, 0, 256);
1505 memset (o, 0, 256);
1506 e[c1] = 1;
1507 o[c2] = 1;
1509 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1510 |CATEGORY_MASK_UTF_16_BE
1511 | CATEGORY_MASK_UTF_16_LE);
1513 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1514 != CATEGORY_MASK_UTF_16)
1516 TWO_MORE_BYTES (c1, c2);
1517 if (c2 < 0)
1518 break;
1519 if (! e[c1])
1521 e[c1] = 1;
1522 e_num++;
1523 if (e_num >= 128)
1524 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1526 if (! o[c2])
1528 o[c2] = 1;
1529 o_num++;
1530 if (o_num >= 128)
1531 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1534 return 0;
1537 no_more_source:
1538 return 1;
1541 static void
1542 decode_coding_utf_16 (struct coding_system *coding)
1544 const unsigned char *src = coding->source + coding->consumed;
1545 const unsigned char *src_end = coding->source + coding->src_bytes;
1546 const unsigned char *src_base;
1547 int *charbuf = coding->charbuf + coding->charbuf_used;
1548 /* We may produces at most 3 chars in one loop. */
1549 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1550 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1551 bool multibytep = coding->src_multibyte;
1552 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1553 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1554 int surrogate = CODING_UTF_16_SURROGATE (coding);
1555 bool eol_dos
1556 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1557 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1559 if (bom == utf_with_bom)
1561 int c, c1, c2;
1563 src_base = src;
1564 ONE_MORE_BYTE (c1);
1565 ONE_MORE_BYTE (c2);
1566 c = (c1 << 8) | c2;
1568 if (endian == utf_16_big_endian
1569 ? c != 0xFEFF : c != 0xFFFE)
1571 /* The first two bytes are not BOM. Treat them as bytes
1572 for a normal character. */
1573 src = src_base;
1574 coding->errors++;
1576 CODING_UTF_16_BOM (coding) = utf_without_bom;
1578 else if (bom == utf_detect_bom)
1580 /* We have already tried to detect BOM and failed in
1581 detect_coding. */
1582 CODING_UTF_16_BOM (coding) = utf_without_bom;
1585 while (1)
1587 int c, c1, c2;
1589 src_base = src;
1590 consumed_chars_base = consumed_chars;
1592 if (charbuf >= charbuf_end)
1594 if (byte_after_cr1 >= 0)
1595 src_base -= 2;
1596 break;
1599 if (byte_after_cr1 >= 0)
1600 c1 = byte_after_cr1, byte_after_cr1 = -1;
1601 else
1602 ONE_MORE_BYTE (c1);
1603 if (c1 < 0)
1605 *charbuf++ = -c1;
1606 continue;
1608 if (byte_after_cr2 >= 0)
1609 c2 = byte_after_cr2, byte_after_cr2 = -1;
1610 else
1611 ONE_MORE_BYTE (c2);
1612 if (c2 < 0)
1614 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1615 *charbuf++ = -c2;
1616 continue;
1618 c = (endian == utf_16_big_endian
1619 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1621 if (surrogate)
1623 if (! UTF_16_LOW_SURROGATE_P (c))
1625 if (endian == utf_16_big_endian)
1626 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1627 else
1628 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1629 *charbuf++ = c1;
1630 *charbuf++ = c2;
1631 coding->errors++;
1632 if (UTF_16_HIGH_SURROGATE_P (c))
1633 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1634 else
1635 *charbuf++ = c;
1637 else
1639 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1640 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1641 *charbuf++ = 0x10000 + c;
1644 else
1646 if (UTF_16_HIGH_SURROGATE_P (c))
1647 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1648 else
1650 if (eol_dos && c == '\r')
1652 ONE_MORE_BYTE (byte_after_cr1);
1653 ONE_MORE_BYTE (byte_after_cr2);
1655 *charbuf++ = c;
1660 no_more_source:
1661 coding->consumed_char += consumed_chars_base;
1662 coding->consumed = src_base - coding->source;
1663 coding->charbuf_used = charbuf - coding->charbuf;
1666 static bool
1667 encode_coding_utf_16 (struct coding_system *coding)
1669 bool multibytep = coding->dst_multibyte;
1670 int *charbuf = coding->charbuf;
1671 int *charbuf_end = charbuf + coding->charbuf_used;
1672 unsigned char *dst = coding->destination + coding->produced;
1673 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1674 int safe_room = 8;
1675 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1676 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1677 ptrdiff_t produced_chars = 0;
1678 int c;
1680 if (bom != utf_without_bom)
1682 ASSURE_DESTINATION (safe_room);
1683 if (big_endian)
1684 EMIT_TWO_BYTES (0xFE, 0xFF);
1685 else
1686 EMIT_TWO_BYTES (0xFF, 0xFE);
1687 CODING_UTF_16_BOM (coding) = utf_without_bom;
1690 while (charbuf < charbuf_end)
1692 ASSURE_DESTINATION (safe_room);
1693 c = *charbuf++;
1694 if (c > MAX_UNICODE_CHAR)
1695 c = coding->default_char;
1697 if (c < 0x10000)
1699 if (big_endian)
1700 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1701 else
1702 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1704 else
1706 int c1, c2;
1708 c -= 0x10000;
1709 c1 = (c >> 10) + 0xD800;
1710 c2 = (c & 0x3FF) + 0xDC00;
1711 if (big_endian)
1712 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1713 else
1714 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1717 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1718 coding->produced = dst - coding->destination;
1719 coding->produced_char += produced_chars;
1720 return 0;
1724 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1726 /* Emacs' internal format for representation of multiple character
1727 sets is a kind of multi-byte encoding, i.e. characters are
1728 represented by variable-length sequences of one-byte codes.
1730 ASCII characters and control characters (e.g. `tab', `newline') are
1731 represented by one-byte sequences which are their ASCII codes, in
1732 the range 0x00 through 0x7F.
1734 8-bit characters of the range 0x80..0x9F are represented by
1735 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1736 code + 0x20).
1738 8-bit characters of the range 0xA0..0xFF are represented by
1739 one-byte sequences which are their 8-bit code.
1741 The other characters are represented by a sequence of `base
1742 leading-code', optional `extended leading-code', and one or two
1743 `position-code's. The length of the sequence is determined by the
1744 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1745 whereas extended leading-code and position-code take the range 0xA0
1746 through 0xFF. See `charset.h' for more details about leading-code
1747 and position-code.
1749 --- CODE RANGE of Emacs' internal format ---
1750 character set range
1751 ------------- -----
1752 ascii 0x00..0x7F
1753 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1754 eight-bit-graphic 0xA0..0xBF
1755 ELSE 0x81..0x9D + [0xA0..0xFF]+
1756 ---------------------------------------------
1758 As this is the internal character representation, the format is
1759 usually not used externally (i.e. in a file or in a data sent to a
1760 process). But, it is possible to have a text externally in this
1761 format (i.e. by encoding by the coding system `emacs-mule').
1763 In that case, a sequence of one-byte codes has a slightly different
1764 form.
1766 At first, all characters in eight-bit-control are represented by
1767 one-byte sequences which are their 8-bit code.
1769 Next, character composition data are represented by the byte
1770 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1771 where,
1772 METHOD is 0xF2 plus one of composition method (enum
1773 composition_method),
1775 BYTES is 0xA0 plus a byte length of this composition data,
1777 CHARS is 0xA0 plus a number of characters composed by this
1778 data,
1780 COMPONENTs are characters of multibyte form or composition
1781 rules encoded by two-byte of ASCII codes.
1783 In addition, for backward compatibility, the following formats are
1784 also recognized as composition data on decoding.
1786 0x80 MSEQ ...
1787 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1789 Here,
1790 MSEQ is a multibyte form but in these special format:
1791 ASCII: 0xA0 ASCII_CODE+0x80,
1792 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1793 RULE is a one byte code of the range 0xA0..0xF0 that
1794 represents a composition rule.
1797 char emacs_mule_bytes[256];
1800 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1801 Return true if a text is encoded in 'emacs-mule'. */
1803 static bool
1804 detect_coding_emacs_mule (struct coding_system *coding,
1805 struct coding_detection_info *detect_info)
1807 const unsigned char *src = coding->source, *src_base;
1808 const unsigned char *src_end = coding->source + coding->src_bytes;
1809 bool multibytep = coding->src_multibyte;
1810 ptrdiff_t consumed_chars = 0;
1811 int c;
1812 int found = 0;
1814 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1815 /* A coding system of this category is always ASCII compatible. */
1816 src += coding->head_ascii;
1818 while (1)
1820 src_base = src;
1821 ONE_MORE_BYTE (c);
1822 if (c < 0)
1823 continue;
1824 if (c == 0x80)
1826 /* Perhaps the start of composite character. We simply skip
1827 it because analyzing it is too heavy for detecting. But,
1828 at least, we check that the composite character
1829 constitutes of more than 4 bytes. */
1830 const unsigned char *src_start;
1832 repeat:
1833 src_start = src;
1836 ONE_MORE_BYTE (c);
1838 while (c >= 0xA0);
1840 if (src - src_start <= 4)
1841 break;
1842 found = CATEGORY_MASK_EMACS_MULE;
1843 if (c == 0x80)
1844 goto repeat;
1847 if (c < 0x80)
1849 if (c < 0x20
1850 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1851 break;
1853 else
1855 int more_bytes = emacs_mule_bytes[c] - 1;
1857 while (more_bytes > 0)
1859 ONE_MORE_BYTE (c);
1860 if (c < 0xA0)
1862 src--; /* Unread the last byte. */
1863 break;
1865 more_bytes--;
1867 if (more_bytes != 0)
1868 break;
1869 found = CATEGORY_MASK_EMACS_MULE;
1872 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1873 return 0;
1875 no_more_source:
1876 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1878 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1879 return 0;
1881 detect_info->found |= found;
1882 return 1;
1886 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1887 character. If CMP_STATUS indicates that we must expect MSEQ or
1888 RULE described above, decode it and return the negative value of
1889 the decoded character or rule. If an invalid byte is found, return
1890 -1. If SRC is too short, return -2. */
1892 static int
1893 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1894 int *nbytes, int *nchars, int *id,
1895 struct composition_status *cmp_status)
1897 const unsigned char *src_end = coding->source + coding->src_bytes;
1898 const unsigned char *src_base = src;
1899 bool multibytep = coding->src_multibyte;
1900 int charset_ID;
1901 unsigned code;
1902 int c;
1903 int consumed_chars = 0;
1904 bool mseq_found = 0;
1906 ONE_MORE_BYTE (c);
1907 if (c < 0)
1909 c = -c;
1910 charset_ID = emacs_mule_charset[0];
1912 else
1914 if (c >= 0xA0)
1916 if (cmp_status->state != COMPOSING_NO
1917 && cmp_status->old_form)
1919 if (cmp_status->state == COMPOSING_CHAR)
1921 if (c == 0xA0)
1923 ONE_MORE_BYTE (c);
1924 c -= 0x80;
1925 if (c < 0)
1926 goto invalid_code;
1928 else
1929 c -= 0x20;
1930 mseq_found = 1;
1932 else
1934 *nbytes = src - src_base;
1935 *nchars = consumed_chars;
1936 return -c;
1939 else
1940 goto invalid_code;
1943 switch (emacs_mule_bytes[c])
1945 case 2:
1946 if ((charset_ID = emacs_mule_charset[c]) < 0)
1947 goto invalid_code;
1948 ONE_MORE_BYTE (c);
1949 if (c < 0xA0)
1950 goto invalid_code;
1951 code = c & 0x7F;
1952 break;
1954 case 3:
1955 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1956 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1958 ONE_MORE_BYTE (c);
1959 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1960 goto invalid_code;
1961 ONE_MORE_BYTE (c);
1962 if (c < 0xA0)
1963 goto invalid_code;
1964 code = c & 0x7F;
1966 else
1968 if ((charset_ID = emacs_mule_charset[c]) < 0)
1969 goto invalid_code;
1970 ONE_MORE_BYTE (c);
1971 if (c < 0xA0)
1972 goto invalid_code;
1973 code = (c & 0x7F) << 8;
1974 ONE_MORE_BYTE (c);
1975 if (c < 0xA0)
1976 goto invalid_code;
1977 code |= c & 0x7F;
1979 break;
1981 case 4:
1982 ONE_MORE_BYTE (c);
1983 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1984 goto invalid_code;
1985 ONE_MORE_BYTE (c);
1986 if (c < 0xA0)
1987 goto invalid_code;
1988 code = (c & 0x7F) << 8;
1989 ONE_MORE_BYTE (c);
1990 if (c < 0xA0)
1991 goto invalid_code;
1992 code |= c & 0x7F;
1993 break;
1995 case 1:
1996 code = c;
1997 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
1998 break;
2000 default:
2001 emacs_abort ();
2003 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2004 CHARSET_FROM_ID (charset_ID), code, c);
2005 if (c < 0)
2006 goto invalid_code;
2008 *nbytes = src - src_base;
2009 *nchars = consumed_chars;
2010 if (id)
2011 *id = charset_ID;
2012 return (mseq_found ? -c : c);
2014 no_more_source:
2015 return -2;
2017 invalid_code:
2018 return -1;
2022 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2024 /* Handle these composition sequence ('|': the end of header elements,
2025 BYTES and CHARS >= 0xA0):
2027 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2028 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2029 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2031 and these old form:
2033 (4) relative composition: 0x80 | MSEQ ... MSEQ
2034 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2036 When the starter 0x80 and the following header elements are found,
2037 this annotation header is produced.
2039 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2041 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2042 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2044 Then, upon reading the following elements, these codes are produced
2045 until the composition end is found:
2047 (1) CHAR ... CHAR
2048 (2) ALT ... ALT CHAR ... CHAR
2049 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2050 (4) CHAR ... CHAR
2051 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2053 When the composition end is found, LENGTH and NCHARS in the
2054 annotation header is updated as below:
2056 (1) LENGTH: unchanged, NCHARS: unchanged
2057 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2058 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2059 (4) LENGTH: unchanged, NCHARS: number of CHARs
2060 (5) LENGTH: unchanged, NCHARS: number of CHARs
2062 If an error is found while composing, the annotation header is
2063 changed to the original composition header (plus filler -1s) as
2064 below:
2066 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2067 (5) [ 0x80 0xFF -1 -1- -1 ]
2069 and the sequence [ -2 DECODED-RULE ] is changed to the original
2070 byte sequence as below:
2071 o the original byte sequence is B: [ B -1 ]
2072 o the original byte sequence is B1 B2: [ B1 B2 ]
2074 Most of the routines are implemented by macros because many
2075 variables and labels in the caller decode_coding_emacs_mule must be
2076 accessible, and they are usually called just once (thus doesn't
2077 increase the size of compiled object). */
2079 /* Decode a composition rule represented by C as a component of
2080 composition sequence of Emacs 20 style. Set RULE to the decoded
2081 rule. */
2083 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2084 do { \
2085 int gref, nref; \
2087 c -= 0xA0; \
2088 if (c < 0 || c >= 81) \
2089 goto invalid_code; \
2090 gref = c / 9, nref = c % 9; \
2091 if (gref == 4) gref = 10; \
2092 if (nref == 4) nref = 10; \
2093 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2094 } while (0)
2097 /* Decode a composition rule represented by C and the following byte
2098 at SRC as a component of composition sequence of Emacs 21 style.
2099 Set RULE to the decoded rule. */
2101 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2102 do { \
2103 int gref, nref; \
2105 gref = c - 0x20; \
2106 if (gref < 0 || gref >= 81) \
2107 goto invalid_code; \
2108 ONE_MORE_BYTE (c); \
2109 nref = c - 0x20; \
2110 if (nref < 0 || nref >= 81) \
2111 goto invalid_code; \
2112 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2113 } while (0)
2116 /* Start of Emacs 21 style format. The first three bytes at SRC are
2117 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2118 byte length of this composition information, CHARS is the number of
2119 characters composed by this composition. */
2121 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2122 do { \
2123 enum composition_method method = c - 0xF2; \
2124 int nbytes, nchars; \
2126 ONE_MORE_BYTE (c); \
2127 if (c < 0) \
2128 goto invalid_code; \
2129 nbytes = c - 0xA0; \
2130 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2131 goto invalid_code; \
2132 ONE_MORE_BYTE (c); \
2133 nchars = c - 0xA0; \
2134 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2135 goto invalid_code; \
2136 cmp_status->old_form = 0; \
2137 cmp_status->method = method; \
2138 if (method == COMPOSITION_RELATIVE) \
2139 cmp_status->state = COMPOSING_CHAR; \
2140 else \
2141 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2142 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2143 cmp_status->nchars = nchars; \
2144 cmp_status->ncomps = nbytes - 4; \
2145 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2146 } while (0)
2149 /* Start of Emacs 20 style format for relative composition. */
2151 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2152 do { \
2153 cmp_status->old_form = 1; \
2154 cmp_status->method = COMPOSITION_RELATIVE; \
2155 cmp_status->state = COMPOSING_CHAR; \
2156 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2157 cmp_status->nchars = cmp_status->ncomps = 0; \
2158 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2159 } while (0)
2162 /* Start of Emacs 20 style format for rule-base composition. */
2164 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2165 do { \
2166 cmp_status->old_form = 1; \
2167 cmp_status->method = COMPOSITION_WITH_RULE; \
2168 cmp_status->state = COMPOSING_CHAR; \
2169 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2170 cmp_status->nchars = cmp_status->ncomps = 0; \
2171 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2172 } while (0)
2175 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2176 do { \
2177 const unsigned char *current_src = src; \
2179 ONE_MORE_BYTE (c); \
2180 if (c < 0) \
2181 goto invalid_code; \
2182 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2183 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2184 DECODE_EMACS_MULE_21_COMPOSITION (); \
2185 else if (c < 0xA0) \
2186 goto invalid_code; \
2187 else if (c < 0xC0) \
2189 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2190 /* Re-read C as a composition component. */ \
2191 src = current_src; \
2193 else if (c == 0xFF) \
2194 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2195 else \
2196 goto invalid_code; \
2197 } while (0)
2199 #define EMACS_MULE_COMPOSITION_END() \
2200 do { \
2201 int idx = - cmp_status->length; \
2203 if (cmp_status->old_form) \
2204 charbuf[idx + 2] = cmp_status->nchars; \
2205 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2206 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2207 cmp_status->state = COMPOSING_NO; \
2208 } while (0)
2211 static int
2212 emacs_mule_finish_composition (int *charbuf,
2213 struct composition_status *cmp_status)
2215 int idx = - cmp_status->length;
2216 int new_chars;
2218 if (cmp_status->old_form && cmp_status->nchars > 0)
2220 charbuf[idx + 2] = cmp_status->nchars;
2221 new_chars = 0;
2222 if (cmp_status->method == COMPOSITION_WITH_RULE
2223 && cmp_status->state == COMPOSING_CHAR)
2225 /* The last rule was invalid. */
2226 int rule = charbuf[-1] + 0xA0;
2228 charbuf[-2] = BYTE8_TO_CHAR (rule);
2229 charbuf[-1] = -1;
2230 new_chars = 1;
2233 else
2235 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2237 if (cmp_status->method == COMPOSITION_WITH_RULE)
2239 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2240 charbuf[idx++] = -3;
2241 charbuf[idx++] = 0;
2242 new_chars = 1;
2244 else
2246 int nchars = charbuf[idx + 1] + 0xA0;
2247 int nbytes = charbuf[idx + 2] + 0xA0;
2249 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2250 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2251 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2252 charbuf[idx++] = -1;
2253 new_chars = 4;
2256 cmp_status->state = COMPOSING_NO;
2257 return new_chars;
2260 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2261 do { \
2262 if (cmp_status->state != COMPOSING_NO) \
2263 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2264 } while (0)
2267 static void
2268 decode_coding_emacs_mule (struct coding_system *coding)
2270 const unsigned char *src = coding->source + coding->consumed;
2271 const unsigned char *src_end = coding->source + coding->src_bytes;
2272 const unsigned char *src_base;
2273 int *charbuf = coding->charbuf + coding->charbuf_used;
2274 /* We may produce two annotations (charset and composition) in one
2275 loop and one more charset annotation at the end. */
2276 int *charbuf_end
2277 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2278 /* We can produce up to 2 characters in a loop. */
2279 - 1;
2280 ptrdiff_t consumed_chars = 0, consumed_chars_base;
2281 bool multibytep = coding->src_multibyte;
2282 ptrdiff_t char_offset = coding->produced_char;
2283 ptrdiff_t last_offset = char_offset;
2284 int last_id = charset_ascii;
2285 bool eol_dos
2286 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2287 int byte_after_cr = -1;
2288 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2290 if (cmp_status->state != COMPOSING_NO)
2292 int i;
2294 if (charbuf_end - charbuf < cmp_status->length)
2295 emacs_abort ();
2296 for (i = 0; i < cmp_status->length; i++)
2297 *charbuf++ = cmp_status->carryover[i];
2298 coding->annotated = 1;
2301 while (1)
2303 int c, id IF_LINT (= 0);
2305 src_base = src;
2306 consumed_chars_base = consumed_chars;
2308 if (charbuf >= charbuf_end)
2310 if (byte_after_cr >= 0)
2311 src_base--;
2312 break;
2315 if (byte_after_cr >= 0)
2316 c = byte_after_cr, byte_after_cr = -1;
2317 else
2318 ONE_MORE_BYTE (c);
2320 if (c < 0 || c == 0x80)
2322 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2323 if (c < 0)
2325 *charbuf++ = -c;
2326 char_offset++;
2328 else
2329 DECODE_EMACS_MULE_COMPOSITION_START ();
2330 continue;
2333 if (c < 0x80)
2335 if (eol_dos && c == '\r')
2336 ONE_MORE_BYTE (byte_after_cr);
2337 id = charset_ascii;
2338 if (cmp_status->state != COMPOSING_NO)
2340 if (cmp_status->old_form)
2341 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2342 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2343 cmp_status->ncomps--;
2346 else
2348 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2349 /* emacs_mule_char can load a charset map from a file, which
2350 allocates a large structure and might cause buffer text
2351 to be relocated as result. Thus, we need to remember the
2352 original pointer to buffer text, and fix up all related
2353 pointers after the call. */
2354 const unsigned char *orig = coding->source;
2355 ptrdiff_t offset;
2357 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2358 cmp_status);
2359 offset = coding->source - orig;
2360 if (offset)
2362 src += offset;
2363 src_base += offset;
2364 src_end += offset;
2366 if (c < 0)
2368 if (c == -1)
2369 goto invalid_code;
2370 if (c == -2)
2371 break;
2373 src = src_base + nbytes;
2374 consumed_chars = consumed_chars_base + nchars;
2375 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2376 cmp_status->ncomps -= nchars;
2379 /* Now if C >= 0, we found a normally encoded character, if C <
2380 0, we found an old-style composition component character or
2381 rule. */
2383 if (cmp_status->state == COMPOSING_NO)
2385 if (last_id != id)
2387 if (last_id != charset_ascii)
2388 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2389 last_id);
2390 last_id = id;
2391 last_offset = char_offset;
2393 *charbuf++ = c;
2394 char_offset++;
2396 else if (cmp_status->state == COMPOSING_CHAR)
2398 if (cmp_status->old_form)
2400 if (c >= 0)
2402 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2403 *charbuf++ = c;
2404 char_offset++;
2406 else
2408 *charbuf++ = -c;
2409 cmp_status->nchars++;
2410 cmp_status->length++;
2411 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2412 EMACS_MULE_COMPOSITION_END ();
2413 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2414 cmp_status->state = COMPOSING_RULE;
2417 else
2419 *charbuf++ = c;
2420 cmp_status->length++;
2421 cmp_status->nchars--;
2422 if (cmp_status->nchars == 0)
2423 EMACS_MULE_COMPOSITION_END ();
2426 else if (cmp_status->state == COMPOSING_RULE)
2428 int rule;
2430 if (c >= 0)
2432 EMACS_MULE_COMPOSITION_END ();
2433 *charbuf++ = c;
2434 char_offset++;
2436 else
2438 c = -c;
2439 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2440 if (rule < 0)
2441 goto invalid_code;
2442 *charbuf++ = -2;
2443 *charbuf++ = rule;
2444 cmp_status->length += 2;
2445 cmp_status->state = COMPOSING_CHAR;
2448 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2450 *charbuf++ = c;
2451 cmp_status->length++;
2452 if (cmp_status->ncomps == 0)
2453 cmp_status->state = COMPOSING_CHAR;
2454 else if (cmp_status->ncomps > 0)
2456 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2457 cmp_status->state = COMPOSING_COMPONENT_RULE;
2459 else
2460 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2462 else /* COMPOSING_COMPONENT_RULE */
2464 int rule;
2466 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2467 if (rule < 0)
2468 goto invalid_code;
2469 *charbuf++ = -2;
2470 *charbuf++ = rule;
2471 cmp_status->length += 2;
2472 cmp_status->ncomps--;
2473 if (cmp_status->ncomps > 0)
2474 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2475 else
2476 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2478 continue;
2480 invalid_code:
2481 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2482 src = src_base;
2483 consumed_chars = consumed_chars_base;
2484 ONE_MORE_BYTE (c);
2485 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2486 char_offset++;
2487 coding->errors++;
2490 no_more_source:
2491 if (cmp_status->state != COMPOSING_NO)
2493 if (coding->mode & CODING_MODE_LAST_BLOCK)
2494 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2495 else
2497 int i;
2499 charbuf -= cmp_status->length;
2500 for (i = 0; i < cmp_status->length; i++)
2501 cmp_status->carryover[i] = charbuf[i];
2504 if (last_id != charset_ascii)
2505 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2506 coding->consumed_char += consumed_chars_base;
2507 coding->consumed = src_base - coding->source;
2508 coding->charbuf_used = charbuf - coding->charbuf;
2512 #define EMACS_MULE_LEADING_CODES(id, codes) \
2513 do { \
2514 if (id < 0xA0) \
2515 codes[0] = id, codes[1] = 0; \
2516 else if (id < 0xE0) \
2517 codes[0] = 0x9A, codes[1] = id; \
2518 else if (id < 0xF0) \
2519 codes[0] = 0x9B, codes[1] = id; \
2520 else if (id < 0xF5) \
2521 codes[0] = 0x9C, codes[1] = id; \
2522 else \
2523 codes[0] = 0x9D, codes[1] = id; \
2524 } while (0);
2527 static bool
2528 encode_coding_emacs_mule (struct coding_system *coding)
2530 bool multibytep = coding->dst_multibyte;
2531 int *charbuf = coding->charbuf;
2532 int *charbuf_end = charbuf + coding->charbuf_used;
2533 unsigned char *dst = coding->destination + coding->produced;
2534 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2535 int safe_room = 8;
2536 ptrdiff_t produced_chars = 0;
2537 Lisp_Object attrs, charset_list;
2538 int c;
2539 int preferred_charset_id = -1;
2541 CODING_GET_INFO (coding, attrs, charset_list);
2542 if (! EQ (charset_list, Vemacs_mule_charset_list))
2544 charset_list = Vemacs_mule_charset_list;
2545 ASET (attrs, coding_attr_charset_list, charset_list);
2548 while (charbuf < charbuf_end)
2550 ASSURE_DESTINATION (safe_room);
2551 c = *charbuf++;
2553 if (c < 0)
2555 /* Handle an annotation. */
2556 switch (*charbuf)
2558 case CODING_ANNOTATE_COMPOSITION_MASK:
2559 /* Not yet implemented. */
2560 break;
2561 case CODING_ANNOTATE_CHARSET_MASK:
2562 preferred_charset_id = charbuf[3];
2563 if (preferred_charset_id >= 0
2564 && NILP (Fmemq (make_number (preferred_charset_id),
2565 charset_list)))
2566 preferred_charset_id = -1;
2567 break;
2568 default:
2569 emacs_abort ();
2571 charbuf += -c - 1;
2572 continue;
2575 if (ASCII_CHAR_P (c))
2576 EMIT_ONE_ASCII_BYTE (c);
2577 else if (CHAR_BYTE8_P (c))
2579 c = CHAR_TO_BYTE8 (c);
2580 EMIT_ONE_BYTE (c);
2582 else
2584 struct charset *charset;
2585 unsigned code;
2586 int dimension;
2587 int emacs_mule_id;
2588 unsigned char leading_codes[2];
2590 if (preferred_charset_id >= 0)
2592 bool result;
2594 charset = CHARSET_FROM_ID (preferred_charset_id);
2595 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2596 if (result)
2597 code = ENCODE_CHAR (charset, c);
2598 else
2599 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2600 &code, charset);
2602 else
2603 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2604 &code, charset);
2605 if (! charset)
2607 c = coding->default_char;
2608 if (ASCII_CHAR_P (c))
2610 EMIT_ONE_ASCII_BYTE (c);
2611 continue;
2613 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2614 &code, charset);
2616 dimension = CHARSET_DIMENSION (charset);
2617 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2618 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2619 EMIT_ONE_BYTE (leading_codes[0]);
2620 if (leading_codes[1])
2621 EMIT_ONE_BYTE (leading_codes[1]);
2622 if (dimension == 1)
2623 EMIT_ONE_BYTE (code | 0x80);
2624 else
2626 code |= 0x8080;
2627 EMIT_ONE_BYTE (code >> 8);
2628 EMIT_ONE_BYTE (code & 0xFF);
2632 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2633 coding->produced_char += produced_chars;
2634 coding->produced = dst - coding->destination;
2635 return 0;
2639 /*** 7. ISO2022 handlers ***/
2641 /* The following note describes the coding system ISO2022 briefly.
2642 Since the intention of this note is to help understand the
2643 functions in this file, some parts are NOT ACCURATE or are OVERLY
2644 SIMPLIFIED. For thorough understanding, please refer to the
2645 original document of ISO2022. This is equivalent to the standard
2646 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2648 ISO2022 provides many mechanisms to encode several character sets
2649 in 7-bit and 8-bit environments. For 7-bit environments, all text
2650 is encoded using bytes less than 128. This may make the encoded
2651 text a little bit longer, but the text passes more easily through
2652 several types of gateway, some of which strip off the MSB (Most
2653 Significant Bit).
2655 There are two kinds of character sets: control character sets and
2656 graphic character sets. The former contain control characters such
2657 as `newline' and `escape' to provide control functions (control
2658 functions are also provided by escape sequences). The latter
2659 contain graphic characters such as 'A' and '-'. Emacs recognizes
2660 two control character sets and many graphic character sets.
2662 Graphic character sets are classified into one of the following
2663 four classes, according to the number of bytes (DIMENSION) and
2664 number of characters in one dimension (CHARS) of the set:
2665 - DIMENSION1_CHARS94
2666 - DIMENSION1_CHARS96
2667 - DIMENSION2_CHARS94
2668 - DIMENSION2_CHARS96
2670 In addition, each character set is assigned an identification tag,
2671 unique for each set, called the "final character" (denoted as <F>
2672 hereafter). The <F> of each character set is decided by ECMA(*)
2673 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2674 (0x30..0x3F are for private use only).
2676 Note (*): ECMA = European Computer Manufacturers Association
2678 Here are examples of graphic character sets [NAME(<F>)]:
2679 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2680 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2681 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2682 o DIMENSION2_CHARS96 -- none for the moment
2684 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2685 C0 [0x00..0x1F] -- control character plane 0
2686 GL [0x20..0x7F] -- graphic character plane 0
2687 C1 [0x80..0x9F] -- control character plane 1
2688 GR [0xA0..0xFF] -- graphic character plane 1
2690 A control character set is directly designated and invoked to C0 or
2691 C1 by an escape sequence. The most common case is that:
2692 - ISO646's control character set is designated/invoked to C0, and
2693 - ISO6429's control character set is designated/invoked to C1,
2694 and usually these designations/invocations are omitted in encoded
2695 text. In a 7-bit environment, only C0 can be used, and a control
2696 character for C1 is encoded by an appropriate escape sequence to
2697 fit into the environment. All control characters for C1 are
2698 defined to have corresponding escape sequences.
2700 A graphic character set is at first designated to one of four
2701 graphic registers (G0 through G3), then these graphic registers are
2702 invoked to GL or GR. These designations and invocations can be
2703 done independently. The most common case is that G0 is invoked to
2704 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2705 these invocations and designations are omitted in encoded text.
2706 In a 7-bit environment, only GL can be used.
2708 When a graphic character set of CHARS94 is invoked to GL, codes
2709 0x20 and 0x7F of the GL area work as control characters SPACE and
2710 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2711 be used.
2713 There are two ways of invocation: locking-shift and single-shift.
2714 With locking-shift, the invocation lasts until the next different
2715 invocation, whereas with single-shift, the invocation affects the
2716 following character only and doesn't affect the locking-shift
2717 state. Invocations are done by the following control characters or
2718 escape sequences:
2720 ----------------------------------------------------------------------
2721 abbrev function cntrl escape seq description
2722 ----------------------------------------------------------------------
2723 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2724 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2725 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2726 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2727 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2728 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2729 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2730 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2731 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2732 ----------------------------------------------------------------------
2733 (*) These are not used by any known coding system.
2735 Control characters for these functions are defined by macros
2736 ISO_CODE_XXX in `coding.h'.
2738 Designations are done by the following escape sequences:
2739 ----------------------------------------------------------------------
2740 escape sequence description
2741 ----------------------------------------------------------------------
2742 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2743 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2744 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2745 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2746 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2747 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2748 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2749 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2750 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2751 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2752 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2753 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2754 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2755 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2756 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2757 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2758 ----------------------------------------------------------------------
2760 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2761 of dimension 1, chars 94, and final character <F>, etc...
2763 Note (*): Although these designations are not allowed in ISO2022,
2764 Emacs accepts them on decoding, and produces them on encoding
2765 CHARS96 character sets in a coding system which is characterized as
2766 7-bit environment, non-locking-shift, and non-single-shift.
2768 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2769 '(' must be omitted. We refer to this as "short-form" hereafter.
2771 Now you may notice that there are a lot of ways of encoding the
2772 same multilingual text in ISO2022. Actually, there exist many
2773 coding systems such as Compound Text (used in X11's inter client
2774 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2775 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2776 localized platforms), and all of these are variants of ISO2022.
2778 In addition to the above, Emacs handles two more kinds of escape
2779 sequences: ISO6429's direction specification and Emacs' private
2780 sequence for specifying character composition.
2782 ISO6429's direction specification takes the following form:
2783 o CSI ']' -- end of the current direction
2784 o CSI '0' ']' -- end of the current direction
2785 o CSI '1' ']' -- start of left-to-right text
2786 o CSI '2' ']' -- start of right-to-left text
2787 The control character CSI (0x9B: control sequence introducer) is
2788 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2790 Character composition specification takes the following form:
2791 o ESC '0' -- start relative composition
2792 o ESC '1' -- end composition
2793 o ESC '2' -- start rule-base composition (*)
2794 o ESC '3' -- start relative composition with alternate chars (**)
2795 o ESC '4' -- start rule-base composition with alternate chars (**)
2796 Since these are not standard escape sequences of any ISO standard,
2797 the use of them with these meanings is restricted to Emacs only.
2799 (*) This form is used only in Emacs 20.7 and older versions,
2800 but newer versions can safely decode it.
2801 (**) This form is used only in Emacs 21.1 and newer versions,
2802 and older versions can't decode it.
2804 Here's a list of example usages of these composition escape
2805 sequences (categorized by `enum composition_method').
2807 COMPOSITION_RELATIVE:
2808 ESC 0 CHAR [ CHAR ] ESC 1
2809 COMPOSITION_WITH_RULE:
2810 ESC 2 CHAR [ RULE CHAR ] ESC 1
2811 COMPOSITION_WITH_ALTCHARS:
2812 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2813 COMPOSITION_WITH_RULE_ALTCHARS:
2814 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2816 static enum iso_code_class_type iso_code_class[256];
2818 #define SAFE_CHARSET_P(coding, id) \
2819 ((id) <= (coding)->max_charset_id \
2820 && (coding)->safe_charsets[id] != 255)
2822 static void
2823 setup_iso_safe_charsets (Lisp_Object attrs)
2825 Lisp_Object charset_list, safe_charsets;
2826 Lisp_Object request;
2827 Lisp_Object reg_usage;
2828 Lisp_Object tail;
2829 EMACS_INT reg94, reg96;
2830 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2831 int max_charset_id;
2833 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2834 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2835 && ! EQ (charset_list, Viso_2022_charset_list))
2837 charset_list = Viso_2022_charset_list;
2838 ASET (attrs, coding_attr_charset_list, charset_list);
2839 ASET (attrs, coding_attr_safe_charsets, Qnil);
2842 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2843 return;
2845 max_charset_id = 0;
2846 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2848 int id = XINT (XCAR (tail));
2849 if (max_charset_id < id)
2850 max_charset_id = id;
2853 safe_charsets = make_uninit_string (max_charset_id + 1);
2854 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2855 request = AREF (attrs, coding_attr_iso_request);
2856 reg_usage = AREF (attrs, coding_attr_iso_usage);
2857 reg94 = XINT (XCAR (reg_usage));
2858 reg96 = XINT (XCDR (reg_usage));
2860 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2862 Lisp_Object id;
2863 Lisp_Object reg;
2864 struct charset *charset;
2866 id = XCAR (tail);
2867 charset = CHARSET_FROM_ID (XINT (id));
2868 reg = Fcdr (Fassq (id, request));
2869 if (! NILP (reg))
2870 SSET (safe_charsets, XINT (id), XINT (reg));
2871 else if (charset->iso_chars_96)
2873 if (reg96 < 4)
2874 SSET (safe_charsets, XINT (id), reg96);
2876 else
2878 if (reg94 < 4)
2879 SSET (safe_charsets, XINT (id), reg94);
2882 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2886 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2887 Return true if a text is encoded in one of ISO-2022 based coding
2888 systems. */
2890 static bool
2891 detect_coding_iso_2022 (struct coding_system *coding,
2892 struct coding_detection_info *detect_info)
2894 const unsigned char *src = coding->source, *src_base = src;
2895 const unsigned char *src_end = coding->source + coding->src_bytes;
2896 bool multibytep = coding->src_multibyte;
2897 bool single_shifting = 0;
2898 int id;
2899 int c, c1;
2900 ptrdiff_t consumed_chars = 0;
2901 int i;
2902 int rejected = 0;
2903 int found = 0;
2904 int composition_count = -1;
2906 detect_info->checked |= CATEGORY_MASK_ISO;
2908 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2910 struct coding_system *this = &(coding_categories[i]);
2911 Lisp_Object attrs, val;
2913 if (this->id < 0)
2914 continue;
2915 attrs = CODING_ID_ATTRS (this->id);
2916 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2917 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2918 setup_iso_safe_charsets (attrs);
2919 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2920 this->max_charset_id = SCHARS (val) - 1;
2921 this->safe_charsets = SDATA (val);
2924 /* A coding system of this category is always ASCII compatible. */
2925 src += coding->head_ascii;
2927 while (rejected != CATEGORY_MASK_ISO)
2929 src_base = src;
2930 ONE_MORE_BYTE (c);
2931 switch (c)
2933 case ISO_CODE_ESC:
2934 if (inhibit_iso_escape_detection)
2935 break;
2936 single_shifting = 0;
2937 ONE_MORE_BYTE (c);
2938 if (c == 'N' || c == 'O')
2940 /* ESC <Fe> for SS2 or SS3. */
2941 single_shifting = 1;
2942 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2944 else if (c == '1')
2946 /* End of composition. */
2947 if (composition_count < 0
2948 || composition_count > MAX_COMPOSITION_COMPONENTS)
2949 /* Invalid */
2950 break;
2951 composition_count = -1;
2952 found |= CATEGORY_MASK_ISO;
2954 else if (c >= '0' && c <= '4')
2956 /* ESC <Fp> for start/end composition. */
2957 composition_count = 0;
2959 else
2961 if (c >= '(' && c <= '/')
2963 /* Designation sequence for a charset of dimension 1. */
2964 ONE_MORE_BYTE (c1);
2965 if (c1 < ' ' || c1 >= 0x80
2966 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2967 /* Invalid designation sequence. Just ignore. */
2968 break;
2970 else if (c == '$')
2972 /* Designation sequence for a charset of dimension 2. */
2973 ONE_MORE_BYTE (c);
2974 if (c >= '@' && c <= 'B')
2975 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2976 id = iso_charset_table[1][0][c];
2977 else if (c >= '(' && c <= '/')
2979 ONE_MORE_BYTE (c1);
2980 if (c1 < ' ' || c1 >= 0x80
2981 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2982 /* Invalid designation sequence. Just ignore. */
2983 break;
2985 else
2986 /* Invalid designation sequence. Just ignore it. */
2987 break;
2989 else
2991 /* Invalid escape sequence. Just ignore it. */
2992 break;
2995 /* We found a valid designation sequence for CHARSET. */
2996 rejected |= CATEGORY_MASK_ISO_8BIT;
2997 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2998 id))
2999 found |= CATEGORY_MASK_ISO_7;
3000 else
3001 rejected |= CATEGORY_MASK_ISO_7;
3002 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3003 id))
3004 found |= CATEGORY_MASK_ISO_7_TIGHT;
3005 else
3006 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3007 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3008 id))
3009 found |= CATEGORY_MASK_ISO_7_ELSE;
3010 else
3011 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3012 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3013 id))
3014 found |= CATEGORY_MASK_ISO_8_ELSE;
3015 else
3016 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3018 break;
3020 case ISO_CODE_SO:
3021 case ISO_CODE_SI:
3022 /* Locking shift out/in. */
3023 if (inhibit_iso_escape_detection)
3024 break;
3025 single_shifting = 0;
3026 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3027 break;
3029 case ISO_CODE_CSI:
3030 /* Control sequence introducer. */
3031 single_shifting = 0;
3032 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3033 found |= CATEGORY_MASK_ISO_8_ELSE;
3034 goto check_extra_latin;
3036 case ISO_CODE_SS2:
3037 case ISO_CODE_SS3:
3038 /* Single shift. */
3039 if (inhibit_iso_escape_detection)
3040 break;
3041 single_shifting = 0;
3042 rejected |= CATEGORY_MASK_ISO_7BIT;
3043 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3044 & CODING_ISO_FLAG_SINGLE_SHIFT)
3046 found |= CATEGORY_MASK_ISO_8_1;
3047 single_shifting = 1;
3049 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3050 & CODING_ISO_FLAG_SINGLE_SHIFT)
3052 found |= CATEGORY_MASK_ISO_8_2;
3053 single_shifting = 1;
3055 if (single_shifting)
3056 break;
3057 goto check_extra_latin;
3059 default:
3060 if (c < 0)
3061 continue;
3062 if (c < 0x80)
3064 if (composition_count >= 0)
3065 composition_count++;
3066 single_shifting = 0;
3067 break;
3069 if (c >= 0xA0)
3071 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3072 found |= CATEGORY_MASK_ISO_8_1;
3073 /* Check the length of succeeding codes of the range
3074 0xA0..0FF. If the byte length is even, we include
3075 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3076 only when we are not single shifting. */
3077 if (! single_shifting
3078 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3080 int len = 1;
3081 while (src < src_end)
3083 src_base = src;
3084 ONE_MORE_BYTE (c);
3085 if (c < 0xA0)
3087 src = src_base;
3088 break;
3090 len++;
3093 if (len & 1 && src < src_end)
3095 rejected |= CATEGORY_MASK_ISO_8_2;
3096 if (composition_count >= 0)
3097 composition_count += len;
3099 else
3101 found |= CATEGORY_MASK_ISO_8_2;
3102 if (composition_count >= 0)
3103 composition_count += len / 2;
3106 break;
3108 check_extra_latin:
3109 if (! VECTORP (Vlatin_extra_code_table)
3110 || NILP (AREF (Vlatin_extra_code_table, c)))
3112 rejected = CATEGORY_MASK_ISO;
3113 break;
3115 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3116 & CODING_ISO_FLAG_LATIN_EXTRA)
3117 found |= CATEGORY_MASK_ISO_8_1;
3118 else
3119 rejected |= CATEGORY_MASK_ISO_8_1;
3120 rejected |= CATEGORY_MASK_ISO_8_2;
3121 break;
3124 detect_info->rejected |= CATEGORY_MASK_ISO;
3125 return 0;
3127 no_more_source:
3128 detect_info->rejected |= rejected;
3129 detect_info->found |= (found & ~rejected);
3130 return 1;
3134 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3135 escape sequence should be kept. */
3136 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3137 do { \
3138 int id, prev; \
3140 if (final < '0' || final >= 128 \
3141 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3142 || !SAFE_CHARSET_P (coding, id)) \
3144 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3145 chars_96 = -1; \
3146 break; \
3148 prev = CODING_ISO_DESIGNATION (coding, reg); \
3149 if (id == charset_jisx0201_roman) \
3151 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3152 id = charset_ascii; \
3154 else if (id == charset_jisx0208_1978) \
3156 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3157 id = charset_jisx0208; \
3159 CODING_ISO_DESIGNATION (coding, reg) = id; \
3160 /* If there was an invalid designation to REG previously, and this \
3161 designation is ASCII to REG, we should keep this designation \
3162 sequence. */ \
3163 if (prev == -2 && id == charset_ascii) \
3164 chars_96 = -1; \
3165 } while (0)
3168 /* Handle these composition sequence (ALT: alternate char):
3170 (1) relative composition: ESC 0 CHAR ... ESC 1
3171 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3172 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3173 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3175 When the start sequence (ESC 0/2/3/4) is found, this annotation
3176 header is produced.
3178 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3180 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3181 produced until the end sequence (ESC 1) is found:
3183 (1) CHAR ... CHAR
3184 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3185 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3186 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3188 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3189 annotation header is updated as below:
3191 (1) LENGTH: unchanged, NCHARS: number of CHARs
3192 (2) LENGTH: unchanged, NCHARS: number of CHARs
3193 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3194 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3196 If an error is found while composing, the annotation header is
3197 changed to:
3199 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3201 and the sequence [ -2 DECODED-RULE ] is changed to the original
3202 byte sequence as below:
3203 o the original byte sequence is B: [ B -1 ]
3204 o the original byte sequence is B1 B2: [ B1 B2 ]
3205 and the sequence [ -1 -1 ] is changed to the original byte
3206 sequence:
3207 [ ESC '0' ]
3210 /* Decode a composition rule C1 and maybe one more byte from the
3211 source, and set RULE to the encoded composition rule. If the rule
3212 is invalid, goto invalid_code. */
3214 #define DECODE_COMPOSITION_RULE(rule) \
3215 do { \
3216 rule = c1 - 32; \
3217 if (rule < 0) \
3218 goto invalid_code; \
3219 if (rule < 81) /* old format (before ver.21) */ \
3221 int gref = (rule) / 9; \
3222 int nref = (rule) % 9; \
3223 if (gref == 4) gref = 10; \
3224 if (nref == 4) nref = 10; \
3225 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3227 else /* new format (after ver.21) */ \
3229 int b; \
3231 ONE_MORE_BYTE (b); \
3232 if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32)) \
3233 goto invalid_code; \
3234 rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32); \
3235 rule += 0x100; /* Distinguish it from the old format. */ \
3237 } while (0)
3239 #define ENCODE_COMPOSITION_RULE(rule) \
3240 do { \
3241 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3243 if (rule < 0x100) /* old format */ \
3245 if (gref == 10) gref = 4; \
3246 if (nref == 10) nref = 4; \
3247 charbuf[idx] = 32 + gref * 9 + nref; \
3248 charbuf[idx + 1] = -1; \
3249 new_chars++; \
3251 else /* new format */ \
3253 charbuf[idx] = 32 + 81 + gref; \
3254 charbuf[idx + 1] = 32 + nref; \
3255 new_chars += 2; \
3257 } while (0)
3259 /* Finish the current composition as invalid. */
3261 static int
3262 finish_composition (int *charbuf, struct composition_status *cmp_status)
3264 int idx = - cmp_status->length;
3265 int new_chars;
3267 /* Recover the original ESC sequence */
3268 charbuf[idx++] = ISO_CODE_ESC;
3269 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3270 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3271 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3272 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3273 : '4');
3274 charbuf[idx++] = -2;
3275 charbuf[idx++] = 0;
3276 charbuf[idx++] = -1;
3277 new_chars = cmp_status->nchars;
3278 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3279 for (; idx < 0; idx++)
3281 int elt = charbuf[idx];
3283 if (elt == -2)
3285 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3286 idx++;
3288 else if (elt == -1)
3290 charbuf[idx++] = ISO_CODE_ESC;
3291 charbuf[idx] = '0';
3292 new_chars += 2;
3295 cmp_status->state = COMPOSING_NO;
3296 return new_chars;
3299 /* If characters are under composition, finish the composition. */
3300 #define MAYBE_FINISH_COMPOSITION() \
3301 do { \
3302 if (cmp_status->state != COMPOSING_NO) \
3303 char_offset += finish_composition (charbuf, cmp_status); \
3304 } while (0)
3306 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3308 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3309 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3310 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3311 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3313 Produce this annotation sequence now:
3315 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3318 #define DECODE_COMPOSITION_START(c1) \
3319 do { \
3320 if (c1 == '0' \
3321 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3322 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3323 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3324 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3326 *charbuf++ = -1; \
3327 *charbuf++= -1; \
3328 cmp_status->state = COMPOSING_CHAR; \
3329 cmp_status->length += 2; \
3331 else \
3333 MAYBE_FINISH_COMPOSITION (); \
3334 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3335 : c1 == '2' ? COMPOSITION_WITH_RULE \
3336 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3337 : COMPOSITION_WITH_RULE_ALTCHARS); \
3338 cmp_status->state \
3339 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3340 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3341 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3342 cmp_status->nchars = cmp_status->ncomps = 0; \
3343 coding->annotated = 1; \
3345 } while (0)
3348 /* Handle composition end sequence ESC 1. */
3350 #define DECODE_COMPOSITION_END() \
3351 do { \
3352 if (cmp_status->nchars == 0 \
3353 || ((cmp_status->state == COMPOSING_CHAR) \
3354 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3356 MAYBE_FINISH_COMPOSITION (); \
3357 goto invalid_code; \
3359 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3360 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3361 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3362 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3363 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3364 char_offset += cmp_status->nchars; \
3365 cmp_status->state = COMPOSING_NO; \
3366 } while (0)
3368 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3370 #define STORE_COMPOSITION_RULE(rule) \
3371 do { \
3372 *charbuf++ = -2; \
3373 *charbuf++ = rule; \
3374 cmp_status->length += 2; \
3375 cmp_status->state--; \
3376 } while (0)
3378 /* Store a composed char or a component char C in charbuf, and update
3379 cmp_status. */
3381 #define STORE_COMPOSITION_CHAR(c) \
3382 do { \
3383 *charbuf++ = (c); \
3384 cmp_status->length++; \
3385 if (cmp_status->state == COMPOSING_CHAR) \
3386 cmp_status->nchars++; \
3387 else \
3388 cmp_status->ncomps++; \
3389 if (cmp_status->method == COMPOSITION_WITH_RULE \
3390 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3391 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3392 cmp_status->state++; \
3393 } while (0)
3396 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3398 static void
3399 decode_coding_iso_2022 (struct coding_system *coding)
3401 const unsigned char *src = coding->source + coding->consumed;
3402 const unsigned char *src_end = coding->source + coding->src_bytes;
3403 const unsigned char *src_base;
3404 int *charbuf = coding->charbuf + coding->charbuf_used;
3405 /* We may produce two annotations (charset and composition) in one
3406 loop and one more charset annotation at the end. */
3407 int *charbuf_end
3408 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3409 ptrdiff_t consumed_chars = 0, consumed_chars_base;
3410 bool multibytep = coding->src_multibyte;
3411 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3412 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3413 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3414 int charset_id_2, charset_id_3;
3415 struct charset *charset;
3416 int c;
3417 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3418 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3419 ptrdiff_t char_offset = coding->produced_char;
3420 ptrdiff_t last_offset = char_offset;
3421 int last_id = charset_ascii;
3422 bool eol_dos
3423 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3424 int byte_after_cr = -1;
3425 int i;
3427 setup_iso_safe_charsets (attrs);
3428 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3430 if (cmp_status->state != COMPOSING_NO)
3432 if (charbuf_end - charbuf < cmp_status->length)
3433 emacs_abort ();
3434 for (i = 0; i < cmp_status->length; i++)
3435 *charbuf++ = cmp_status->carryover[i];
3436 coding->annotated = 1;
3439 while (1)
3441 int c1, c2, c3;
3443 src_base = src;
3444 consumed_chars_base = consumed_chars;
3446 if (charbuf >= charbuf_end)
3448 if (byte_after_cr >= 0)
3449 src_base--;
3450 break;
3453 if (byte_after_cr >= 0)
3454 c1 = byte_after_cr, byte_after_cr = -1;
3455 else
3456 ONE_MORE_BYTE (c1);
3457 if (c1 < 0)
3458 goto invalid_code;
3460 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3462 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3463 char_offset++;
3464 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3465 continue;
3468 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3470 if (c1 == ISO_CODE_ESC)
3472 if (src + 1 >= src_end)
3473 goto no_more_source;
3474 *charbuf++ = ISO_CODE_ESC;
3475 char_offset++;
3476 if (src[0] == '%' && src[1] == '@')
3478 src += 2;
3479 consumed_chars += 2;
3480 char_offset += 2;
3481 /* We are sure charbuf can contain two more chars. */
3482 *charbuf++ = '%';
3483 *charbuf++ = '@';
3484 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3487 else
3489 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3490 char_offset++;
3492 continue;
3495 if ((cmp_status->state == COMPOSING_RULE
3496 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3497 && c1 != ISO_CODE_ESC)
3499 int rule;
3501 DECODE_COMPOSITION_RULE (rule);
3502 STORE_COMPOSITION_RULE (rule);
3503 continue;
3506 /* We produce at most one character. */
3507 switch (iso_code_class [c1])
3509 case ISO_0x20_or_0x7F:
3510 if (charset_id_0 < 0
3511 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3512 /* This is SPACE or DEL. */
3513 charset = CHARSET_FROM_ID (charset_ascii);
3514 else
3515 charset = CHARSET_FROM_ID (charset_id_0);
3516 break;
3518 case ISO_graphic_plane_0:
3519 if (charset_id_0 < 0)
3520 charset = CHARSET_FROM_ID (charset_ascii);
3521 else
3522 charset = CHARSET_FROM_ID (charset_id_0);
3523 break;
3525 case ISO_0xA0_or_0xFF:
3526 if (charset_id_1 < 0
3527 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3528 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3529 goto invalid_code;
3530 /* This is a graphic character, we fall down ... */
3532 case ISO_graphic_plane_1:
3533 if (charset_id_1 < 0)
3534 goto invalid_code;
3535 charset = CHARSET_FROM_ID (charset_id_1);
3536 break;
3538 case ISO_control_0:
3539 if (eol_dos && c1 == '\r')
3540 ONE_MORE_BYTE (byte_after_cr);
3541 MAYBE_FINISH_COMPOSITION ();
3542 charset = CHARSET_FROM_ID (charset_ascii);
3543 break;
3545 case ISO_control_1:
3546 goto invalid_code;
3548 case ISO_shift_out:
3549 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3550 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3551 goto invalid_code;
3552 CODING_ISO_INVOCATION (coding, 0) = 1;
3553 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3554 continue;
3556 case ISO_shift_in:
3557 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3558 goto invalid_code;
3559 CODING_ISO_INVOCATION (coding, 0) = 0;
3560 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3561 continue;
3563 case ISO_single_shift_2_7:
3564 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3565 goto invalid_code;
3566 case ISO_single_shift_2:
3567 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3568 goto invalid_code;
3569 /* SS2 is handled as an escape sequence of ESC 'N' */
3570 c1 = 'N';
3571 goto label_escape_sequence;
3573 case ISO_single_shift_3:
3574 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3575 goto invalid_code;
3576 /* SS2 is handled as an escape sequence of ESC 'O' */
3577 c1 = 'O';
3578 goto label_escape_sequence;
3580 case ISO_control_sequence_introducer:
3581 /* CSI is handled as an escape sequence of ESC '[' ... */
3582 c1 = '[';
3583 goto label_escape_sequence;
3585 case ISO_escape:
3586 ONE_MORE_BYTE (c1);
3587 label_escape_sequence:
3588 /* Escape sequences handled here are invocation,
3589 designation, direction specification, and character
3590 composition specification. */
3591 switch (c1)
3593 case '&': /* revision of following character set */
3594 ONE_MORE_BYTE (c1);
3595 if (!(c1 >= '@' && c1 <= '~'))
3596 goto invalid_code;
3597 ONE_MORE_BYTE (c1);
3598 if (c1 != ISO_CODE_ESC)
3599 goto invalid_code;
3600 ONE_MORE_BYTE (c1);
3601 goto label_escape_sequence;
3603 case '$': /* designation of 2-byte character set */
3604 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3605 goto invalid_code;
3607 int reg, chars96;
3609 ONE_MORE_BYTE (c1);
3610 if (c1 >= '@' && c1 <= 'B')
3611 { /* designation of JISX0208.1978, GB2312.1980,
3612 or JISX0208.1980 */
3613 reg = 0, chars96 = 0;
3615 else if (c1 >= 0x28 && c1 <= 0x2B)
3616 { /* designation of DIMENSION2_CHARS94 character set */
3617 reg = c1 - 0x28, chars96 = 0;
3618 ONE_MORE_BYTE (c1);
3620 else if (c1 >= 0x2C && c1 <= 0x2F)
3621 { /* designation of DIMENSION2_CHARS96 character set */
3622 reg = c1 - 0x2C, chars96 = 1;
3623 ONE_MORE_BYTE (c1);
3625 else
3626 goto invalid_code;
3627 DECODE_DESIGNATION (reg, 2, chars96, c1);
3628 /* We must update these variables now. */
3629 if (reg == 0)
3630 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3631 else if (reg == 1)
3632 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3633 if (chars96 < 0)
3634 goto invalid_code;
3636 continue;
3638 case 'n': /* invocation of locking-shift-2 */
3639 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3640 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3641 goto invalid_code;
3642 CODING_ISO_INVOCATION (coding, 0) = 2;
3643 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3644 continue;
3646 case 'o': /* invocation of locking-shift-3 */
3647 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3648 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3649 goto invalid_code;
3650 CODING_ISO_INVOCATION (coding, 0) = 3;
3651 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3652 continue;
3654 case 'N': /* invocation of single-shift-2 */
3655 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3656 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3657 goto invalid_code;
3658 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3659 if (charset_id_2 < 0)
3660 charset = CHARSET_FROM_ID (charset_ascii);
3661 else
3662 charset = CHARSET_FROM_ID (charset_id_2);
3663 ONE_MORE_BYTE (c1);
3664 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3665 goto invalid_code;
3666 break;
3668 case 'O': /* invocation of single-shift-3 */
3669 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3670 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3671 goto invalid_code;
3672 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3673 if (charset_id_3 < 0)
3674 charset = CHARSET_FROM_ID (charset_ascii);
3675 else
3676 charset = CHARSET_FROM_ID (charset_id_3);
3677 ONE_MORE_BYTE (c1);
3678 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3679 goto invalid_code;
3680 break;
3682 case '0': case '2': case '3': case '4': /* start composition */
3683 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3684 goto invalid_code;
3685 if (last_id != charset_ascii)
3687 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3688 last_id = charset_ascii;
3689 last_offset = char_offset;
3691 DECODE_COMPOSITION_START (c1);
3692 continue;
3694 case '1': /* end composition */
3695 if (cmp_status->state == COMPOSING_NO)
3696 goto invalid_code;
3697 DECODE_COMPOSITION_END ();
3698 continue;
3700 case '[': /* specification of direction */
3701 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3702 goto invalid_code;
3703 /* For the moment, nested direction is not supported.
3704 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3705 left-to-right, and nonzero means right-to-left. */
3706 ONE_MORE_BYTE (c1);
3707 switch (c1)
3709 case ']': /* end of the current direction */
3710 coding->mode &= ~CODING_MODE_DIRECTION;
3712 case '0': /* end of the current direction */
3713 case '1': /* start of left-to-right direction */
3714 ONE_MORE_BYTE (c1);
3715 if (c1 == ']')
3716 coding->mode &= ~CODING_MODE_DIRECTION;
3717 else
3718 goto invalid_code;
3719 break;
3721 case '2': /* start of right-to-left direction */
3722 ONE_MORE_BYTE (c1);
3723 if (c1 == ']')
3724 coding->mode |= CODING_MODE_DIRECTION;
3725 else
3726 goto invalid_code;
3727 break;
3729 default:
3730 goto invalid_code;
3732 continue;
3734 case '%':
3735 ONE_MORE_BYTE (c1);
3736 if (c1 == '/')
3738 /* CTEXT extended segment:
3739 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3740 We keep these bytes as is for the moment.
3741 They may be decoded by post-read-conversion. */
3742 int dim, M, L;
3743 int size;
3745 ONE_MORE_BYTE (dim);
3746 if (dim < '0' || dim > '4')
3747 goto invalid_code;
3748 ONE_MORE_BYTE (M);
3749 if (M < 128)
3750 goto invalid_code;
3751 ONE_MORE_BYTE (L);
3752 if (L < 128)
3753 goto invalid_code;
3754 size = ((M - 128) * 128) + (L - 128);
3755 if (charbuf + 6 > charbuf_end)
3756 goto break_loop;
3757 *charbuf++ = ISO_CODE_ESC;
3758 *charbuf++ = '%';
3759 *charbuf++ = '/';
3760 *charbuf++ = dim;
3761 *charbuf++ = BYTE8_TO_CHAR (M);
3762 *charbuf++ = BYTE8_TO_CHAR (L);
3763 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3765 else if (c1 == 'G')
3767 /* XFree86 extension for embedding UTF-8 in CTEXT:
3768 ESC % G --UTF-8-BYTES-- ESC % @
3769 We keep these bytes as is for the moment.
3770 They may be decoded by post-read-conversion. */
3771 if (charbuf + 3 > charbuf_end)
3772 goto break_loop;
3773 *charbuf++ = ISO_CODE_ESC;
3774 *charbuf++ = '%';
3775 *charbuf++ = 'G';
3776 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3778 else
3779 goto invalid_code;
3780 continue;
3781 break;
3783 default:
3784 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3785 goto invalid_code;
3787 int reg, chars96;
3789 if (c1 >= 0x28 && c1 <= 0x2B)
3790 { /* designation of DIMENSION1_CHARS94 character set */
3791 reg = c1 - 0x28, chars96 = 0;
3792 ONE_MORE_BYTE (c1);
3794 else if (c1 >= 0x2C && c1 <= 0x2F)
3795 { /* designation of DIMENSION1_CHARS96 character set */
3796 reg = c1 - 0x2C, chars96 = 1;
3797 ONE_MORE_BYTE (c1);
3799 else
3800 goto invalid_code;
3801 DECODE_DESIGNATION (reg, 1, chars96, c1);
3802 /* We must update these variables now. */
3803 if (reg == 0)
3804 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3805 else if (reg == 1)
3806 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3807 if (chars96 < 0)
3808 goto invalid_code;
3810 continue;
3812 break;
3814 default:
3815 emacs_abort ();
3818 if (cmp_status->state == COMPOSING_NO
3819 && charset->id != charset_ascii
3820 && last_id != charset->id)
3822 if (last_id != charset_ascii)
3823 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3824 last_id = charset->id;
3825 last_offset = char_offset;
3828 /* Now we know CHARSET and 1st position code C1 of a character.
3829 Produce a decoded character while getting 2nd and 3rd
3830 position codes C2, C3 if necessary. */
3831 if (CHARSET_DIMENSION (charset) > 1)
3833 ONE_MORE_BYTE (c2);
3834 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3835 || ((c1 & 0x80) != (c2 & 0x80)))
3836 /* C2 is not in a valid range. */
3837 goto invalid_code;
3838 if (CHARSET_DIMENSION (charset) == 2)
3839 c1 = (c1 << 8) | c2;
3840 else
3842 ONE_MORE_BYTE (c3);
3843 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3844 || ((c1 & 0x80) != (c3 & 0x80)))
3845 /* C3 is not in a valid range. */
3846 goto invalid_code;
3847 c1 = (c1 << 16) | (c2 << 8) | c2;
3850 c1 &= 0x7F7F7F;
3851 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3852 if (c < 0)
3854 MAYBE_FINISH_COMPOSITION ();
3855 for (; src_base < src; src_base++, char_offset++)
3857 if (ASCII_BYTE_P (*src_base))
3858 *charbuf++ = *src_base;
3859 else
3860 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3863 else if (cmp_status->state == COMPOSING_NO)
3865 *charbuf++ = c;
3866 char_offset++;
3868 else if ((cmp_status->state == COMPOSING_CHAR
3869 ? cmp_status->nchars
3870 : cmp_status->ncomps)
3871 >= MAX_COMPOSITION_COMPONENTS)
3873 /* Too long composition. */
3874 MAYBE_FINISH_COMPOSITION ();
3875 *charbuf++ = c;
3876 char_offset++;
3878 else
3879 STORE_COMPOSITION_CHAR (c);
3880 continue;
3882 invalid_code:
3883 MAYBE_FINISH_COMPOSITION ();
3884 src = src_base;
3885 consumed_chars = consumed_chars_base;
3886 ONE_MORE_BYTE (c);
3887 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3888 char_offset++;
3889 coding->errors++;
3890 continue;
3892 break_loop:
3893 break;
3896 no_more_source:
3897 if (cmp_status->state != COMPOSING_NO)
3899 if (coding->mode & CODING_MODE_LAST_BLOCK)
3900 MAYBE_FINISH_COMPOSITION ();
3901 else
3903 charbuf -= cmp_status->length;
3904 for (i = 0; i < cmp_status->length; i++)
3905 cmp_status->carryover[i] = charbuf[i];
3908 else if (last_id != charset_ascii)
3909 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3910 coding->consumed_char += consumed_chars_base;
3911 coding->consumed = src_base - coding->source;
3912 coding->charbuf_used = charbuf - coding->charbuf;
3916 /* ISO2022 encoding stuff. */
3919 It is not enough to say just "ISO2022" on encoding, we have to
3920 specify more details. In Emacs, each coding system of ISO2022
3921 variant has the following specifications:
3922 1. Initial designation to G0 thru G3.
3923 2. Allows short-form designation?
3924 3. ASCII should be designated to G0 before control characters?
3925 4. ASCII should be designated to G0 at end of line?
3926 5. 7-bit environment or 8-bit environment?
3927 6. Use locking-shift?
3928 7. Use Single-shift?
3929 And the following two are only for Japanese:
3930 8. Use ASCII in place of JIS0201-1976-Roman?
3931 9. Use JISX0208-1983 in place of JISX0208-1978?
3932 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3933 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3934 details.
3937 /* Produce codes (escape sequence) for designating CHARSET to graphic
3938 register REG at DST, and increment DST. If <final-char> of CHARSET is
3939 '@', 'A', or 'B' and the coding system CODING allows, produce
3940 designation sequence of short-form. */
3942 #define ENCODE_DESIGNATION(charset, reg, coding) \
3943 do { \
3944 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3945 const char *intermediate_char_94 = "()*+"; \
3946 const char *intermediate_char_96 = ",-./"; \
3947 int revision = -1; \
3949 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3950 revision = CHARSET_ISO_REVISION (charset); \
3952 if (revision >= 0) \
3954 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3955 EMIT_ONE_BYTE ('@' + revision); \
3957 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3958 if (CHARSET_DIMENSION (charset) == 1) \
3960 int b; \
3961 if (! CHARSET_ISO_CHARS_96 (charset)) \
3962 b = intermediate_char_94[reg]; \
3963 else \
3964 b = intermediate_char_96[reg]; \
3965 EMIT_ONE_ASCII_BYTE (b); \
3967 else \
3969 EMIT_ONE_ASCII_BYTE ('$'); \
3970 if (! CHARSET_ISO_CHARS_96 (charset)) \
3972 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3973 || reg != 0 \
3974 || final_char < '@' || final_char > 'B') \
3975 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3977 else \
3978 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3980 EMIT_ONE_ASCII_BYTE (final_char); \
3982 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3983 } while (0)
3986 /* The following two macros produce codes (control character or escape
3987 sequence) for ISO2022 single-shift functions (single-shift-2 and
3988 single-shift-3). */
3990 #define ENCODE_SINGLE_SHIFT_2 \
3991 do { \
3992 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3993 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3994 else \
3995 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3996 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3997 } while (0)
4000 #define ENCODE_SINGLE_SHIFT_3 \
4001 do { \
4002 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4003 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4004 else \
4005 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4006 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4007 } while (0)
4010 /* The following four macros produce codes (control character or
4011 escape sequence) for ISO2022 locking-shift functions (shift-in,
4012 shift-out, locking-shift-2, and locking-shift-3). */
4014 #define ENCODE_SHIFT_IN \
4015 do { \
4016 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4017 CODING_ISO_INVOCATION (coding, 0) = 0; \
4018 } while (0)
4021 #define ENCODE_SHIFT_OUT \
4022 do { \
4023 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4024 CODING_ISO_INVOCATION (coding, 0) = 1; \
4025 } while (0)
4028 #define ENCODE_LOCKING_SHIFT_2 \
4029 do { \
4030 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4031 CODING_ISO_INVOCATION (coding, 0) = 2; \
4032 } while (0)
4035 #define ENCODE_LOCKING_SHIFT_3 \
4036 do { \
4037 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4038 CODING_ISO_INVOCATION (coding, 0) = 3; \
4039 } while (0)
4042 /* Produce codes for a DIMENSION1 character whose character set is
4043 CHARSET and whose position-code is C1. Designation and invocation
4044 sequences are also produced in advance if necessary. */
4046 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4047 do { \
4048 int id = CHARSET_ID (charset); \
4050 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4051 && id == charset_ascii) \
4053 id = charset_jisx0201_roman; \
4054 charset = CHARSET_FROM_ID (id); \
4057 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4059 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4060 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4061 else \
4062 EMIT_ONE_BYTE (c1 | 0x80); \
4063 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4064 break; \
4066 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4068 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4069 break; \
4071 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4073 EMIT_ONE_BYTE (c1 | 0x80); \
4074 break; \
4076 else \
4077 /* Since CHARSET is not yet invoked to any graphic planes, we \
4078 must invoke it, or, at first, designate it to some graphic \
4079 register. Then repeat the loop to actually produce the \
4080 character. */ \
4081 dst = encode_invocation_designation (charset, coding, dst, \
4082 &produced_chars); \
4083 } while (1)
4086 /* Produce codes for a DIMENSION2 character whose character set is
4087 CHARSET and whose position-codes are C1 and C2. Designation and
4088 invocation codes are also produced in advance if necessary. */
4090 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4091 do { \
4092 int id = CHARSET_ID (charset); \
4094 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4095 && id == charset_jisx0208) \
4097 id = charset_jisx0208_1978; \
4098 charset = CHARSET_FROM_ID (id); \
4101 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4103 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4104 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4105 else \
4106 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4107 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4108 break; \
4110 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4112 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4113 break; \
4115 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4117 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4118 break; \
4120 else \
4121 /* Since CHARSET is not yet invoked to any graphic planes, we \
4122 must invoke it, or, at first, designate it to some graphic \
4123 register. Then repeat the loop to actually produce the \
4124 character. */ \
4125 dst = encode_invocation_designation (charset, coding, dst, \
4126 &produced_chars); \
4127 } while (1)
4130 #define ENCODE_ISO_CHARACTER(charset, c) \
4131 do { \
4132 unsigned code; \
4133 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4135 if (CHARSET_DIMENSION (charset) == 1) \
4136 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4137 else \
4138 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4139 } while (0)
4142 /* Produce designation and invocation codes at a place pointed by DST
4143 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4144 Return new DST. */
4146 static unsigned char *
4147 encode_invocation_designation (struct charset *charset,
4148 struct coding_system *coding,
4149 unsigned char *dst, ptrdiff_t *p_nchars)
4151 bool multibytep = coding->dst_multibyte;
4152 ptrdiff_t produced_chars = *p_nchars;
4153 int reg; /* graphic register number */
4154 int id = CHARSET_ID (charset);
4156 /* At first, check designations. */
4157 for (reg = 0; reg < 4; reg++)
4158 if (id == CODING_ISO_DESIGNATION (coding, reg))
4159 break;
4161 if (reg >= 4)
4163 /* CHARSET is not yet designated to any graphic registers. */
4164 /* At first check the requested designation. */
4165 reg = CODING_ISO_REQUEST (coding, id);
4166 if (reg < 0)
4167 /* Since CHARSET requests no special designation, designate it
4168 to graphic register 0. */
4169 reg = 0;
4171 ENCODE_DESIGNATION (charset, reg, coding);
4174 if (CODING_ISO_INVOCATION (coding, 0) != reg
4175 && CODING_ISO_INVOCATION (coding, 1) != reg)
4177 /* Since the graphic register REG is not invoked to any graphic
4178 planes, invoke it to graphic plane 0. */
4179 switch (reg)
4181 case 0: /* graphic register 0 */
4182 ENCODE_SHIFT_IN;
4183 break;
4185 case 1: /* graphic register 1 */
4186 ENCODE_SHIFT_OUT;
4187 break;
4189 case 2: /* graphic register 2 */
4190 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4191 ENCODE_SINGLE_SHIFT_2;
4192 else
4193 ENCODE_LOCKING_SHIFT_2;
4194 break;
4196 case 3: /* graphic register 3 */
4197 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4198 ENCODE_SINGLE_SHIFT_3;
4199 else
4200 ENCODE_LOCKING_SHIFT_3;
4201 break;
4205 *p_nchars = produced_chars;
4206 return dst;
4210 /* Produce codes for designation and invocation to reset the graphic
4211 planes and registers to initial state. */
4212 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4213 do { \
4214 int reg; \
4215 struct charset *charset; \
4217 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4218 ENCODE_SHIFT_IN; \
4219 for (reg = 0; reg < 4; reg++) \
4220 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4221 && (CODING_ISO_DESIGNATION (coding, reg) \
4222 != CODING_ISO_INITIAL (coding, reg))) \
4224 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4225 ENCODE_DESIGNATION (charset, reg, coding); \
4227 } while (0)
4230 /* Produce designation sequences of charsets in the line started from
4231 CHARBUF to a place pointed by DST, and return the number of
4232 produced bytes. DST should not directly point a buffer text area
4233 which may be relocated by char_charset call.
4235 If the current block ends before any end-of-line, we may fail to
4236 find all the necessary designations. */
4238 static ptrdiff_t
4239 encode_designation_at_bol (struct coding_system *coding,
4240 int *charbuf, int *charbuf_end,
4241 unsigned char *dst)
4243 unsigned char *orig = dst;
4244 struct charset *charset;
4245 /* Table of charsets to be designated to each graphic register. */
4246 int r[4];
4247 int c, found = 0, reg;
4248 ptrdiff_t produced_chars = 0;
4249 bool multibytep = coding->dst_multibyte;
4250 Lisp_Object attrs;
4251 Lisp_Object charset_list;
4253 attrs = CODING_ID_ATTRS (coding->id);
4254 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4255 if (EQ (charset_list, Qiso_2022))
4256 charset_list = Viso_2022_charset_list;
4258 for (reg = 0; reg < 4; reg++)
4259 r[reg] = -1;
4261 while (charbuf < charbuf_end && found < 4)
4263 int id;
4265 c = *charbuf++;
4266 if (c == '\n')
4267 break;
4268 charset = char_charset (c, charset_list, NULL);
4269 id = CHARSET_ID (charset);
4270 reg = CODING_ISO_REQUEST (coding, id);
4271 if (reg >= 0 && r[reg] < 0)
4273 found++;
4274 r[reg] = id;
4278 if (found)
4280 for (reg = 0; reg < 4; reg++)
4281 if (r[reg] >= 0
4282 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4283 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4286 return dst - orig;
4289 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4291 static bool
4292 encode_coding_iso_2022 (struct coding_system *coding)
4294 bool multibytep = coding->dst_multibyte;
4295 int *charbuf = coding->charbuf;
4296 int *charbuf_end = charbuf + coding->charbuf_used;
4297 unsigned char *dst = coding->destination + coding->produced;
4298 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4299 int safe_room = 16;
4300 bool bol_designation
4301 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4302 && CODING_ISO_BOL (coding));
4303 ptrdiff_t produced_chars = 0;
4304 Lisp_Object attrs, eol_type, charset_list;
4305 bool ascii_compatible;
4306 int c;
4307 int preferred_charset_id = -1;
4309 CODING_GET_INFO (coding, attrs, charset_list);
4310 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4311 if (VECTORP (eol_type))
4312 eol_type = Qunix;
4314 setup_iso_safe_charsets (attrs);
4315 /* Charset list may have been changed. */
4316 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4317 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4319 ascii_compatible
4320 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4321 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4322 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4324 while (charbuf < charbuf_end)
4326 ASSURE_DESTINATION (safe_room);
4328 if (bol_designation)
4330 /* We have to produce designation sequences if any now. */
4331 unsigned char desig_buf[16];
4332 int nbytes;
4333 ptrdiff_t offset;
4335 charset_map_loaded = 0;
4336 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4337 desig_buf);
4338 if (charset_map_loaded
4339 && (offset = coding_change_destination (coding)))
4341 dst += offset;
4342 dst_end += offset;
4344 memcpy (dst, desig_buf, nbytes);
4345 dst += nbytes;
4346 /* We are sure that designation sequences are all ASCII bytes. */
4347 produced_chars += nbytes;
4348 bol_designation = 0;
4349 ASSURE_DESTINATION (safe_room);
4352 c = *charbuf++;
4354 if (c < 0)
4356 /* Handle an annotation. */
4357 switch (*charbuf)
4359 case CODING_ANNOTATE_COMPOSITION_MASK:
4360 /* Not yet implemented. */
4361 break;
4362 case CODING_ANNOTATE_CHARSET_MASK:
4363 preferred_charset_id = charbuf[2];
4364 if (preferred_charset_id >= 0
4365 && NILP (Fmemq (make_number (preferred_charset_id),
4366 charset_list)))
4367 preferred_charset_id = -1;
4368 break;
4369 default:
4370 emacs_abort ();
4372 charbuf += -c - 1;
4373 continue;
4376 /* Now encode the character C. */
4377 if (c < 0x20 || c == 0x7F)
4379 if (c == '\n'
4380 || (c == '\r' && EQ (eol_type, Qmac)))
4382 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4383 ENCODE_RESET_PLANE_AND_REGISTER ();
4384 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4386 int i;
4388 for (i = 0; i < 4; i++)
4389 CODING_ISO_DESIGNATION (coding, i)
4390 = CODING_ISO_INITIAL (coding, i);
4392 bol_designation = ((CODING_ISO_FLAGS (coding)
4393 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4394 != 0);
4396 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4397 ENCODE_RESET_PLANE_AND_REGISTER ();
4398 EMIT_ONE_ASCII_BYTE (c);
4400 else if (ASCII_CHAR_P (c))
4402 if (ascii_compatible)
4403 EMIT_ONE_ASCII_BYTE (c);
4404 else
4406 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4407 ENCODE_ISO_CHARACTER (charset, c);
4410 else if (CHAR_BYTE8_P (c))
4412 c = CHAR_TO_BYTE8 (c);
4413 EMIT_ONE_BYTE (c);
4415 else
4417 struct charset *charset;
4419 if (preferred_charset_id >= 0)
4421 bool result;
4423 charset = CHARSET_FROM_ID (preferred_charset_id);
4424 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4425 if (! result)
4426 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4427 NULL, charset);
4429 else
4430 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4431 NULL, charset);
4432 if (!charset)
4434 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4436 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4437 charset = CHARSET_FROM_ID (charset_ascii);
4439 else
4441 c = coding->default_char;
4442 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4443 charset_list, NULL, charset);
4446 ENCODE_ISO_CHARACTER (charset, c);
4450 if (coding->mode & CODING_MODE_LAST_BLOCK
4451 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4453 ASSURE_DESTINATION (safe_room);
4454 ENCODE_RESET_PLANE_AND_REGISTER ();
4456 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4457 CODING_ISO_BOL (coding) = bol_designation;
4458 coding->produced_char += produced_chars;
4459 coding->produced = dst - coding->destination;
4460 return 0;
4464 /*** 8,9. SJIS and BIG5 handlers ***/
4466 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4467 quite widely. So, for the moment, Emacs supports them in the bare
4468 C code. But, in the future, they may be supported only by CCL. */
4470 /* SJIS is a coding system encoding three character sets: ASCII, right
4471 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4472 as is. A character of charset katakana-jisx0201 is encoded by
4473 "position-code + 0x80". A character of charset japanese-jisx0208
4474 is encoded in 2-byte but two position-codes are divided and shifted
4475 so that it fit in the range below.
4477 --- CODE RANGE of SJIS ---
4478 (character set) (range)
4479 ASCII 0x00 .. 0x7F
4480 KATAKANA-JISX0201 0xA0 .. 0xDF
4481 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4482 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4483 -------------------------------
4487 /* BIG5 is a coding system encoding two character sets: ASCII and
4488 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4489 character set and is encoded in two-byte.
4491 --- CODE RANGE of BIG5 ---
4492 (character set) (range)
4493 ASCII 0x00 .. 0x7F
4494 Big5 (1st byte) 0xA1 .. 0xFE
4495 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4496 --------------------------
4500 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4501 Return true if a text is encoded in SJIS. */
4503 static bool
4504 detect_coding_sjis (struct coding_system *coding,
4505 struct coding_detection_info *detect_info)
4507 const unsigned char *src = coding->source, *src_base;
4508 const unsigned char *src_end = coding->source + coding->src_bytes;
4509 bool multibytep = coding->src_multibyte;
4510 ptrdiff_t consumed_chars = 0;
4511 int found = 0;
4512 int c;
4513 Lisp_Object attrs, charset_list;
4514 int max_first_byte_of_2_byte_code;
4516 CODING_GET_INFO (coding, attrs, charset_list);
4517 max_first_byte_of_2_byte_code
4518 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4520 detect_info->checked |= CATEGORY_MASK_SJIS;
4521 /* A coding system of this category is always ASCII compatible. */
4522 src += coding->head_ascii;
4524 while (1)
4526 src_base = src;
4527 ONE_MORE_BYTE (c);
4528 if (c < 0x80)
4529 continue;
4530 if ((c >= 0x81 && c <= 0x9F)
4531 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4533 ONE_MORE_BYTE (c);
4534 if (c < 0x40 || c == 0x7F || c > 0xFC)
4535 break;
4536 found = CATEGORY_MASK_SJIS;
4538 else if (c >= 0xA0 && c < 0xE0)
4539 found = CATEGORY_MASK_SJIS;
4540 else
4541 break;
4543 detect_info->rejected |= CATEGORY_MASK_SJIS;
4544 return 0;
4546 no_more_source:
4547 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4549 detect_info->rejected |= CATEGORY_MASK_SJIS;
4550 return 0;
4552 detect_info->found |= found;
4553 return 1;
4556 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4557 Return true if a text is encoded in BIG5. */
4559 static bool
4560 detect_coding_big5 (struct coding_system *coding,
4561 struct coding_detection_info *detect_info)
4563 const unsigned char *src = coding->source, *src_base;
4564 const unsigned char *src_end = coding->source + coding->src_bytes;
4565 bool multibytep = coding->src_multibyte;
4566 ptrdiff_t consumed_chars = 0;
4567 int found = 0;
4568 int c;
4570 detect_info->checked |= CATEGORY_MASK_BIG5;
4571 /* A coding system of this category is always ASCII compatible. */
4572 src += coding->head_ascii;
4574 while (1)
4576 src_base = src;
4577 ONE_MORE_BYTE (c);
4578 if (c < 0x80)
4579 continue;
4580 if (c >= 0xA1)
4582 ONE_MORE_BYTE (c);
4583 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4584 return 0;
4585 found = CATEGORY_MASK_BIG5;
4587 else
4588 break;
4590 detect_info->rejected |= CATEGORY_MASK_BIG5;
4591 return 0;
4593 no_more_source:
4594 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4596 detect_info->rejected |= CATEGORY_MASK_BIG5;
4597 return 0;
4599 detect_info->found |= found;
4600 return 1;
4603 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4605 static void
4606 decode_coding_sjis (struct coding_system *coding)
4608 const unsigned char *src = coding->source + coding->consumed;
4609 const unsigned char *src_end = coding->source + coding->src_bytes;
4610 const unsigned char *src_base;
4611 int *charbuf = coding->charbuf + coding->charbuf_used;
4612 /* We may produce one charset annotation in one loop and one more at
4613 the end. */
4614 int *charbuf_end
4615 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4616 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4617 bool multibytep = coding->src_multibyte;
4618 struct charset *charset_roman, *charset_kanji, *charset_kana;
4619 struct charset *charset_kanji2;
4620 Lisp_Object attrs, charset_list, val;
4621 ptrdiff_t char_offset = coding->produced_char;
4622 ptrdiff_t last_offset = char_offset;
4623 int last_id = charset_ascii;
4624 bool eol_dos
4625 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4626 int byte_after_cr = -1;
4628 CODING_GET_INFO (coding, attrs, charset_list);
4630 val = charset_list;
4631 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4632 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4633 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4634 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4636 while (1)
4638 int c, c1;
4639 struct charset *charset;
4641 src_base = src;
4642 consumed_chars_base = consumed_chars;
4644 if (charbuf >= charbuf_end)
4646 if (byte_after_cr >= 0)
4647 src_base--;
4648 break;
4651 if (byte_after_cr >= 0)
4652 c = byte_after_cr, byte_after_cr = -1;
4653 else
4654 ONE_MORE_BYTE (c);
4655 if (c < 0)
4656 goto invalid_code;
4657 if (c < 0x80)
4659 if (eol_dos && c == '\r')
4660 ONE_MORE_BYTE (byte_after_cr);
4661 charset = charset_roman;
4663 else if (c == 0x80 || c == 0xA0)
4664 goto invalid_code;
4665 else if (c >= 0xA1 && c <= 0xDF)
4667 /* SJIS -> JISX0201-Kana */
4668 c &= 0x7F;
4669 charset = charset_kana;
4671 else if (c <= 0xEF)
4673 /* SJIS -> JISX0208 */
4674 ONE_MORE_BYTE (c1);
4675 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4676 goto invalid_code;
4677 c = (c << 8) | c1;
4678 SJIS_TO_JIS (c);
4679 charset = charset_kanji;
4681 else if (c <= 0xFC && charset_kanji2)
4683 /* SJIS -> JISX0213-2 */
4684 ONE_MORE_BYTE (c1);
4685 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4686 goto invalid_code;
4687 c = (c << 8) | c1;
4688 SJIS_TO_JIS2 (c);
4689 charset = charset_kanji2;
4691 else
4692 goto invalid_code;
4693 if (charset->id != charset_ascii
4694 && last_id != charset->id)
4696 if (last_id != charset_ascii)
4697 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4698 last_id = charset->id;
4699 last_offset = char_offset;
4701 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4702 *charbuf++ = c;
4703 char_offset++;
4704 continue;
4706 invalid_code:
4707 src = src_base;
4708 consumed_chars = consumed_chars_base;
4709 ONE_MORE_BYTE (c);
4710 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4711 char_offset++;
4712 coding->errors++;
4715 no_more_source:
4716 if (last_id != charset_ascii)
4717 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4718 coding->consumed_char += consumed_chars_base;
4719 coding->consumed = src_base - coding->source;
4720 coding->charbuf_used = charbuf - coding->charbuf;
4723 static void
4724 decode_coding_big5 (struct coding_system *coding)
4726 const unsigned char *src = coding->source + coding->consumed;
4727 const unsigned char *src_end = coding->source + coding->src_bytes;
4728 const unsigned char *src_base;
4729 int *charbuf = coding->charbuf + coding->charbuf_used;
4730 /* We may produce one charset annotation in one loop and one more at
4731 the end. */
4732 int *charbuf_end
4733 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4734 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4735 bool multibytep = coding->src_multibyte;
4736 struct charset *charset_roman, *charset_big5;
4737 Lisp_Object attrs, charset_list, val;
4738 ptrdiff_t char_offset = coding->produced_char;
4739 ptrdiff_t last_offset = char_offset;
4740 int last_id = charset_ascii;
4741 bool eol_dos
4742 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4743 int byte_after_cr = -1;
4745 CODING_GET_INFO (coding, attrs, charset_list);
4746 val = charset_list;
4747 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4748 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4750 while (1)
4752 int c, c1;
4753 struct charset *charset;
4755 src_base = src;
4756 consumed_chars_base = consumed_chars;
4758 if (charbuf >= charbuf_end)
4760 if (byte_after_cr >= 0)
4761 src_base--;
4762 break;
4765 if (byte_after_cr >= 0)
4766 c = byte_after_cr, byte_after_cr = -1;
4767 else
4768 ONE_MORE_BYTE (c);
4770 if (c < 0)
4771 goto invalid_code;
4772 if (c < 0x80)
4774 if (eol_dos && c == '\r')
4775 ONE_MORE_BYTE (byte_after_cr);
4776 charset = charset_roman;
4778 else
4780 /* BIG5 -> Big5 */
4781 if (c < 0xA1 || c > 0xFE)
4782 goto invalid_code;
4783 ONE_MORE_BYTE (c1);
4784 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4785 goto invalid_code;
4786 c = c << 8 | c1;
4787 charset = charset_big5;
4789 if (charset->id != charset_ascii
4790 && last_id != charset->id)
4792 if (last_id != charset_ascii)
4793 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4794 last_id = charset->id;
4795 last_offset = char_offset;
4797 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4798 *charbuf++ = c;
4799 char_offset++;
4800 continue;
4802 invalid_code:
4803 src = src_base;
4804 consumed_chars = consumed_chars_base;
4805 ONE_MORE_BYTE (c);
4806 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4807 char_offset++;
4808 coding->errors++;
4811 no_more_source:
4812 if (last_id != charset_ascii)
4813 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4814 coding->consumed_char += consumed_chars_base;
4815 coding->consumed = src_base - coding->source;
4816 coding->charbuf_used = charbuf - coding->charbuf;
4819 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4820 This function can encode charsets `ascii', `katakana-jisx0201',
4821 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4822 are sure that all these charsets are registered as official charset
4823 (i.e. do not have extended leading-codes). Characters of other
4824 charsets are produced without any encoding. */
4826 static bool
4827 encode_coding_sjis (struct coding_system *coding)
4829 bool multibytep = coding->dst_multibyte;
4830 int *charbuf = coding->charbuf;
4831 int *charbuf_end = charbuf + coding->charbuf_used;
4832 unsigned char *dst = coding->destination + coding->produced;
4833 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4834 int safe_room = 4;
4835 ptrdiff_t produced_chars = 0;
4836 Lisp_Object attrs, charset_list, val;
4837 bool ascii_compatible;
4838 struct charset *charset_kanji, *charset_kana;
4839 struct charset *charset_kanji2;
4840 int c;
4842 CODING_GET_INFO (coding, attrs, charset_list);
4843 val = XCDR (charset_list);
4844 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4845 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4846 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4848 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4850 while (charbuf < charbuf_end)
4852 ASSURE_DESTINATION (safe_room);
4853 c = *charbuf++;
4854 /* Now encode the character C. */
4855 if (ASCII_CHAR_P (c) && ascii_compatible)
4856 EMIT_ONE_ASCII_BYTE (c);
4857 else if (CHAR_BYTE8_P (c))
4859 c = CHAR_TO_BYTE8 (c);
4860 EMIT_ONE_BYTE (c);
4862 else
4864 unsigned code;
4865 struct charset *charset;
4866 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4867 &code, charset);
4869 if (!charset)
4871 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4873 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4874 charset = CHARSET_FROM_ID (charset_ascii);
4876 else
4878 c = coding->default_char;
4879 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4880 charset_list, &code, charset);
4883 if (code == CHARSET_INVALID_CODE (charset))
4884 emacs_abort ();
4885 if (charset == charset_kanji)
4887 int c1, c2;
4888 JIS_TO_SJIS (code);
4889 c1 = code >> 8, c2 = code & 0xFF;
4890 EMIT_TWO_BYTES (c1, c2);
4892 else if (charset == charset_kana)
4893 EMIT_ONE_BYTE (code | 0x80);
4894 else if (charset_kanji2 && charset == charset_kanji2)
4896 int c1, c2;
4898 c1 = code >> 8;
4899 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4900 || c1 == 0x28
4901 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4903 JIS_TO_SJIS2 (code);
4904 c1 = code >> 8, c2 = code & 0xFF;
4905 EMIT_TWO_BYTES (c1, c2);
4907 else
4908 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4910 else
4911 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4914 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4915 coding->produced_char += produced_chars;
4916 coding->produced = dst - coding->destination;
4917 return 0;
4920 static bool
4921 encode_coding_big5 (struct coding_system *coding)
4923 bool multibytep = coding->dst_multibyte;
4924 int *charbuf = coding->charbuf;
4925 int *charbuf_end = charbuf + coding->charbuf_used;
4926 unsigned char *dst = coding->destination + coding->produced;
4927 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4928 int safe_room = 4;
4929 ptrdiff_t produced_chars = 0;
4930 Lisp_Object attrs, charset_list, val;
4931 bool ascii_compatible;
4932 struct charset *charset_big5;
4933 int c;
4935 CODING_GET_INFO (coding, attrs, charset_list);
4936 val = XCDR (charset_list);
4937 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4938 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4940 while (charbuf < charbuf_end)
4942 ASSURE_DESTINATION (safe_room);
4943 c = *charbuf++;
4944 /* Now encode the character C. */
4945 if (ASCII_CHAR_P (c) && ascii_compatible)
4946 EMIT_ONE_ASCII_BYTE (c);
4947 else if (CHAR_BYTE8_P (c))
4949 c = CHAR_TO_BYTE8 (c);
4950 EMIT_ONE_BYTE (c);
4952 else
4954 unsigned code;
4955 struct charset *charset;
4956 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4957 &code, charset);
4959 if (! charset)
4961 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4963 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4964 charset = CHARSET_FROM_ID (charset_ascii);
4966 else
4968 c = coding->default_char;
4969 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4970 charset_list, &code, charset);
4973 if (code == CHARSET_INVALID_CODE (charset))
4974 emacs_abort ();
4975 if (charset == charset_big5)
4977 int c1, c2;
4979 c1 = code >> 8, c2 = code & 0xFF;
4980 EMIT_TWO_BYTES (c1, c2);
4982 else
4983 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4986 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4987 coding->produced_char += produced_chars;
4988 coding->produced = dst - coding->destination;
4989 return 0;
4993 /*** 10. CCL handlers ***/
4995 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4996 Return true if a text is encoded in a coding system of which
4997 encoder/decoder are written in CCL program. */
4999 static bool
5000 detect_coding_ccl (struct coding_system *coding,
5001 struct coding_detection_info *detect_info)
5003 const unsigned char *src = coding->source, *src_base;
5004 const unsigned char *src_end = coding->source + coding->src_bytes;
5005 bool multibytep = coding->src_multibyte;
5006 ptrdiff_t consumed_chars = 0;
5007 int found = 0;
5008 unsigned char *valids;
5009 ptrdiff_t head_ascii = coding->head_ascii;
5010 Lisp_Object attrs;
5012 detect_info->checked |= CATEGORY_MASK_CCL;
5014 coding = &coding_categories[coding_category_ccl];
5015 valids = CODING_CCL_VALIDS (coding);
5016 attrs = CODING_ID_ATTRS (coding->id);
5017 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5018 src += head_ascii;
5020 while (1)
5022 int c;
5024 src_base = src;
5025 ONE_MORE_BYTE (c);
5026 if (c < 0 || ! valids[c])
5027 break;
5028 if ((valids[c] > 1))
5029 found = CATEGORY_MASK_CCL;
5031 detect_info->rejected |= CATEGORY_MASK_CCL;
5032 return 0;
5034 no_more_source:
5035 detect_info->found |= found;
5036 return 1;
5039 static void
5040 decode_coding_ccl (struct coding_system *coding)
5042 const unsigned char *src = coding->source + coding->consumed;
5043 const unsigned char *src_end = coding->source + coding->src_bytes;
5044 int *charbuf = coding->charbuf + coding->charbuf_used;
5045 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5046 ptrdiff_t consumed_chars = 0;
5047 bool multibytep = coding->src_multibyte;
5048 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5049 int source_charbuf[1024];
5050 int source_byteidx[1025];
5051 Lisp_Object attrs, charset_list;
5053 CODING_GET_INFO (coding, attrs, charset_list);
5055 while (1)
5057 const unsigned char *p = src;
5058 ptrdiff_t offset;
5059 int i = 0;
5061 if (multibytep)
5063 while (i < 1024 && p < src_end)
5065 source_byteidx[i] = p - src;
5066 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5068 source_byteidx[i] = p - src;
5070 else
5071 while (i < 1024 && p < src_end)
5072 source_charbuf[i++] = *p++;
5074 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5075 ccl->last_block = 1;
5076 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5077 charset_map_loaded = 0;
5078 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5079 charset_list);
5080 if (charset_map_loaded
5081 && (offset = coding_change_source (coding)))
5083 p += offset;
5084 src += offset;
5085 src_end += offset;
5087 charbuf += ccl->produced;
5088 if (multibytep)
5089 src += source_byteidx[ccl->consumed];
5090 else
5091 src += ccl->consumed;
5092 consumed_chars += ccl->consumed;
5093 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5094 break;
5097 switch (ccl->status)
5099 case CCL_STAT_SUSPEND_BY_SRC:
5100 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5101 break;
5102 case CCL_STAT_SUSPEND_BY_DST:
5103 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5104 break;
5105 case CCL_STAT_QUIT:
5106 case CCL_STAT_INVALID_CMD:
5107 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5108 break;
5109 default:
5110 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5111 break;
5113 coding->consumed_char += consumed_chars;
5114 coding->consumed = src - coding->source;
5115 coding->charbuf_used = charbuf - coding->charbuf;
5118 static bool
5119 encode_coding_ccl (struct coding_system *coding)
5121 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5122 bool multibytep = coding->dst_multibyte;
5123 int *charbuf = coding->charbuf;
5124 int *charbuf_end = charbuf + coding->charbuf_used;
5125 unsigned char *dst = coding->destination + coding->produced;
5126 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5127 int destination_charbuf[1024];
5128 ptrdiff_t produced_chars = 0;
5129 int i;
5130 Lisp_Object attrs, charset_list;
5132 CODING_GET_INFO (coding, attrs, charset_list);
5133 if (coding->consumed_char == coding->src_chars
5134 && coding->mode & CODING_MODE_LAST_BLOCK)
5135 ccl->last_block = 1;
5139 ptrdiff_t offset;
5141 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5142 charset_map_loaded = 0;
5143 ccl_driver (ccl, charbuf, destination_charbuf,
5144 charbuf_end - charbuf, 1024, charset_list);
5145 if (charset_map_loaded
5146 && (offset = coding_change_destination (coding)))
5147 dst += offset;
5148 if (multibytep)
5150 ASSURE_DESTINATION (ccl->produced * 2);
5151 for (i = 0; i < ccl->produced; i++)
5152 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5154 else
5156 ASSURE_DESTINATION (ccl->produced);
5157 for (i = 0; i < ccl->produced; i++)
5158 *dst++ = destination_charbuf[i] & 0xFF;
5159 produced_chars += ccl->produced;
5161 charbuf += ccl->consumed;
5162 if (ccl->status == CCL_STAT_QUIT
5163 || ccl->status == CCL_STAT_INVALID_CMD)
5164 break;
5166 while (charbuf < charbuf_end);
5168 switch (ccl->status)
5170 case CCL_STAT_SUSPEND_BY_SRC:
5171 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5172 break;
5173 case CCL_STAT_SUSPEND_BY_DST:
5174 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5175 break;
5176 case CCL_STAT_QUIT:
5177 case CCL_STAT_INVALID_CMD:
5178 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5179 break;
5180 default:
5181 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5182 break;
5185 coding->produced_char += produced_chars;
5186 coding->produced = dst - coding->destination;
5187 return 0;
5191 /*** 10, 11. no-conversion handlers ***/
5193 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5195 static void
5196 decode_coding_raw_text (struct coding_system *coding)
5198 bool eol_dos
5199 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5201 coding->chars_at_source = 1;
5202 coding->consumed_char = coding->src_chars;
5203 coding->consumed = coding->src_bytes;
5204 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5206 coding->consumed_char--;
5207 coding->consumed--;
5208 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5210 else
5211 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5214 static bool
5215 encode_coding_raw_text (struct coding_system *coding)
5217 bool multibytep = coding->dst_multibyte;
5218 int *charbuf = coding->charbuf;
5219 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5220 unsigned char *dst = coding->destination + coding->produced;
5221 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5222 ptrdiff_t produced_chars = 0;
5223 int c;
5225 if (multibytep)
5227 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5229 if (coding->src_multibyte)
5230 while (charbuf < charbuf_end)
5232 ASSURE_DESTINATION (safe_room);
5233 c = *charbuf++;
5234 if (ASCII_CHAR_P (c))
5235 EMIT_ONE_ASCII_BYTE (c);
5236 else if (CHAR_BYTE8_P (c))
5238 c = CHAR_TO_BYTE8 (c);
5239 EMIT_ONE_BYTE (c);
5241 else
5243 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5245 CHAR_STRING_ADVANCE (c, p1);
5248 EMIT_ONE_BYTE (*p0);
5249 p0++;
5251 while (p0 < p1);
5254 else
5255 while (charbuf < charbuf_end)
5257 ASSURE_DESTINATION (safe_room);
5258 c = *charbuf++;
5259 EMIT_ONE_BYTE (c);
5262 else
5264 if (coding->src_multibyte)
5266 int safe_room = MAX_MULTIBYTE_LENGTH;
5268 while (charbuf < charbuf_end)
5270 ASSURE_DESTINATION (safe_room);
5271 c = *charbuf++;
5272 if (ASCII_CHAR_P (c))
5273 *dst++ = c;
5274 else if (CHAR_BYTE8_P (c))
5275 *dst++ = CHAR_TO_BYTE8 (c);
5276 else
5277 CHAR_STRING_ADVANCE (c, dst);
5280 else
5282 ASSURE_DESTINATION (charbuf_end - charbuf);
5283 while (charbuf < charbuf_end && dst < dst_end)
5284 *dst++ = *charbuf++;
5286 produced_chars = dst - (coding->destination + coding->produced);
5288 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5289 coding->produced_char += produced_chars;
5290 coding->produced = dst - coding->destination;
5291 return 0;
5294 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5295 Return true if a text is encoded in a charset-based coding system. */
5297 static bool
5298 detect_coding_charset (struct coding_system *coding,
5299 struct coding_detection_info *detect_info)
5301 const unsigned char *src = coding->source, *src_base;
5302 const unsigned char *src_end = coding->source + coding->src_bytes;
5303 bool multibytep = coding->src_multibyte;
5304 ptrdiff_t consumed_chars = 0;
5305 Lisp_Object attrs, valids, name;
5306 int found = 0;
5307 ptrdiff_t head_ascii = coding->head_ascii;
5308 bool check_latin_extra = 0;
5310 detect_info->checked |= CATEGORY_MASK_CHARSET;
5312 coding = &coding_categories[coding_category_charset];
5313 attrs = CODING_ID_ATTRS (coding->id);
5314 valids = AREF (attrs, coding_attr_charset_valids);
5315 name = CODING_ID_NAME (coding->id);
5316 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5317 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5318 || strncmp (SSDATA (SYMBOL_NAME (name)),
5319 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5320 check_latin_extra = 1;
5322 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5323 src += head_ascii;
5325 while (1)
5327 int c;
5328 Lisp_Object val;
5329 struct charset *charset;
5330 int dim, idx;
5332 src_base = src;
5333 ONE_MORE_BYTE (c);
5334 if (c < 0)
5335 continue;
5336 val = AREF (valids, c);
5337 if (NILP (val))
5338 break;
5339 if (c >= 0x80)
5341 if (c < 0xA0
5342 && check_latin_extra
5343 && (!VECTORP (Vlatin_extra_code_table)
5344 || NILP (AREF (Vlatin_extra_code_table, c))))
5345 break;
5346 found = CATEGORY_MASK_CHARSET;
5348 if (INTEGERP (val))
5350 charset = CHARSET_FROM_ID (XFASTINT (val));
5351 dim = CHARSET_DIMENSION (charset);
5352 for (idx = 1; idx < dim; idx++)
5354 if (src == src_end)
5355 goto too_short;
5356 ONE_MORE_BYTE (c);
5357 if (c < charset->code_space[(dim - 1 - idx) * 4]
5358 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5359 break;
5361 if (idx < dim)
5362 break;
5364 else
5366 idx = 1;
5367 for (; CONSP (val); val = XCDR (val))
5369 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5370 dim = CHARSET_DIMENSION (charset);
5371 while (idx < dim)
5373 if (src == src_end)
5374 goto too_short;
5375 ONE_MORE_BYTE (c);
5376 if (c < charset->code_space[(dim - 1 - idx) * 4]
5377 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5378 break;
5379 idx++;
5381 if (idx == dim)
5383 val = Qnil;
5384 break;
5387 if (CONSP (val))
5388 break;
5391 too_short:
5392 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5393 return 0;
5395 no_more_source:
5396 detect_info->found |= found;
5397 return 1;
5400 static void
5401 decode_coding_charset (struct coding_system *coding)
5403 const unsigned char *src = coding->source + coding->consumed;
5404 const unsigned char *src_end = coding->source + coding->src_bytes;
5405 const unsigned char *src_base;
5406 int *charbuf = coding->charbuf + coding->charbuf_used;
5407 /* We may produce one charset annotation in one loop and one more at
5408 the end. */
5409 int *charbuf_end
5410 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5411 ptrdiff_t consumed_chars = 0, consumed_chars_base;
5412 bool multibytep = coding->src_multibyte;
5413 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5414 Lisp_Object valids;
5415 ptrdiff_t char_offset = coding->produced_char;
5416 ptrdiff_t last_offset = char_offset;
5417 int last_id = charset_ascii;
5418 bool eol_dos
5419 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5420 int byte_after_cr = -1;
5422 valids = AREF (attrs, coding_attr_charset_valids);
5424 while (1)
5426 int c;
5427 Lisp_Object val;
5428 struct charset *charset;
5429 int dim;
5430 int len = 1;
5431 unsigned code;
5433 src_base = src;
5434 consumed_chars_base = consumed_chars;
5436 if (charbuf >= charbuf_end)
5438 if (byte_after_cr >= 0)
5439 src_base--;
5440 break;
5443 if (byte_after_cr >= 0)
5445 c = byte_after_cr;
5446 byte_after_cr = -1;
5448 else
5450 ONE_MORE_BYTE (c);
5451 if (eol_dos && c == '\r')
5452 ONE_MORE_BYTE (byte_after_cr);
5454 if (c < 0)
5455 goto invalid_code;
5456 code = c;
5458 val = AREF (valids, c);
5459 if (! INTEGERP (val) && ! CONSP (val))
5460 goto invalid_code;
5461 if (INTEGERP (val))
5463 charset = CHARSET_FROM_ID (XFASTINT (val));
5464 dim = CHARSET_DIMENSION (charset);
5465 while (len < dim)
5467 ONE_MORE_BYTE (c);
5468 code = (code << 8) | c;
5469 len++;
5471 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5472 charset, code, c);
5474 else
5476 /* VAL is a list of charset IDs. It is assured that the
5477 list is sorted by charset dimensions (smaller one
5478 comes first). */
5479 while (CONSP (val))
5481 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5482 dim = CHARSET_DIMENSION (charset);
5483 while (len < dim)
5485 ONE_MORE_BYTE (c);
5486 code = (code << 8) | c;
5487 len++;
5489 CODING_DECODE_CHAR (coding, src, src_base,
5490 src_end, charset, code, c);
5491 if (c >= 0)
5492 break;
5493 val = XCDR (val);
5496 if (c < 0)
5497 goto invalid_code;
5498 if (charset->id != charset_ascii
5499 && last_id != charset->id)
5501 if (last_id != charset_ascii)
5502 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5503 last_id = charset->id;
5504 last_offset = char_offset;
5507 *charbuf++ = c;
5508 char_offset++;
5509 continue;
5511 invalid_code:
5512 src = src_base;
5513 consumed_chars = consumed_chars_base;
5514 ONE_MORE_BYTE (c);
5515 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5516 char_offset++;
5517 coding->errors++;
5520 no_more_source:
5521 if (last_id != charset_ascii)
5522 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5523 coding->consumed_char += consumed_chars_base;
5524 coding->consumed = src_base - coding->source;
5525 coding->charbuf_used = charbuf - coding->charbuf;
5528 static bool
5529 encode_coding_charset (struct coding_system *coding)
5531 bool multibytep = coding->dst_multibyte;
5532 int *charbuf = coding->charbuf;
5533 int *charbuf_end = charbuf + coding->charbuf_used;
5534 unsigned char *dst = coding->destination + coding->produced;
5535 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5536 int safe_room = MAX_MULTIBYTE_LENGTH;
5537 ptrdiff_t produced_chars = 0;
5538 Lisp_Object attrs, charset_list;
5539 bool ascii_compatible;
5540 int c;
5542 CODING_GET_INFO (coding, attrs, charset_list);
5543 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5545 while (charbuf < charbuf_end)
5547 struct charset *charset;
5548 unsigned code;
5550 ASSURE_DESTINATION (safe_room);
5551 c = *charbuf++;
5552 if (ascii_compatible && ASCII_CHAR_P (c))
5553 EMIT_ONE_ASCII_BYTE (c);
5554 else if (CHAR_BYTE8_P (c))
5556 c = CHAR_TO_BYTE8 (c);
5557 EMIT_ONE_BYTE (c);
5559 else
5561 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5562 &code, charset);
5564 if (charset)
5566 if (CHARSET_DIMENSION (charset) == 1)
5567 EMIT_ONE_BYTE (code);
5568 else if (CHARSET_DIMENSION (charset) == 2)
5569 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5570 else if (CHARSET_DIMENSION (charset) == 3)
5571 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5572 else
5573 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5574 (code >> 8) & 0xFF, code & 0xFF);
5576 else
5578 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5579 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5580 else
5581 c = coding->default_char;
5582 EMIT_ONE_BYTE (c);
5587 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5588 coding->produced_char += produced_chars;
5589 coding->produced = dst - coding->destination;
5590 return 0;
5594 /*** 7. C library functions ***/
5596 /* Setup coding context CODING from information about CODING_SYSTEM.
5597 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5598 CODING_SYSTEM is invalid, signal an error. */
5600 void
5601 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5603 Lisp_Object attrs;
5604 Lisp_Object eol_type;
5605 Lisp_Object coding_type;
5606 Lisp_Object val;
5608 if (NILP (coding_system))
5609 coding_system = Qundecided;
5611 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5613 attrs = CODING_ID_ATTRS (coding->id);
5614 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5616 coding->mode = 0;
5617 coding->head_ascii = -1;
5618 if (VECTORP (eol_type))
5619 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5620 | CODING_REQUIRE_DETECTION_MASK);
5621 else if (! EQ (eol_type, Qunix))
5622 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5623 | CODING_REQUIRE_ENCODING_MASK);
5624 else
5625 coding->common_flags = 0;
5626 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5627 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5628 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5629 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5630 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5631 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5633 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5634 coding->max_charset_id = SCHARS (val) - 1;
5635 coding->safe_charsets = SDATA (val);
5636 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5637 coding->carryover_bytes = 0;
5639 coding_type = CODING_ATTR_TYPE (attrs);
5640 if (EQ (coding_type, Qundecided))
5642 coding->detector = NULL;
5643 coding->decoder = decode_coding_raw_text;
5644 coding->encoder = encode_coding_raw_text;
5645 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5647 else if (EQ (coding_type, Qiso_2022))
5649 int i;
5650 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5652 /* Invoke graphic register 0 to plane 0. */
5653 CODING_ISO_INVOCATION (coding, 0) = 0;
5654 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5655 CODING_ISO_INVOCATION (coding, 1)
5656 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5657 /* Setup the initial status of designation. */
5658 for (i = 0; i < 4; i++)
5659 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5660 /* Not single shifting initially. */
5661 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5662 /* Beginning of buffer should also be regarded as bol. */
5663 CODING_ISO_BOL (coding) = 1;
5664 coding->detector = detect_coding_iso_2022;
5665 coding->decoder = decode_coding_iso_2022;
5666 coding->encoder = encode_coding_iso_2022;
5667 if (flags & CODING_ISO_FLAG_SAFE)
5668 coding->mode |= CODING_MODE_SAFE_ENCODING;
5669 coding->common_flags
5670 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5671 | CODING_REQUIRE_FLUSHING_MASK);
5672 if (flags & CODING_ISO_FLAG_COMPOSITION)
5673 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5674 if (flags & CODING_ISO_FLAG_DESIGNATION)
5675 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5676 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5678 setup_iso_safe_charsets (attrs);
5679 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5680 coding->max_charset_id = SCHARS (val) - 1;
5681 coding->safe_charsets = SDATA (val);
5683 CODING_ISO_FLAGS (coding) = flags;
5684 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5685 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5686 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5687 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5689 else if (EQ (coding_type, Qcharset))
5691 coding->detector = detect_coding_charset;
5692 coding->decoder = decode_coding_charset;
5693 coding->encoder = encode_coding_charset;
5694 coding->common_flags
5695 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5697 else if (EQ (coding_type, Qutf_8))
5699 val = AREF (attrs, coding_attr_utf_bom);
5700 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5701 : EQ (val, Qt) ? utf_with_bom
5702 : utf_without_bom);
5703 coding->detector = detect_coding_utf_8;
5704 coding->decoder = decode_coding_utf_8;
5705 coding->encoder = encode_coding_utf_8;
5706 coding->common_flags
5707 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5708 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5709 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5711 else if (EQ (coding_type, Qutf_16))
5713 val = AREF (attrs, coding_attr_utf_bom);
5714 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5715 : EQ (val, Qt) ? utf_with_bom
5716 : utf_without_bom);
5717 val = AREF (attrs, coding_attr_utf_16_endian);
5718 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5719 : utf_16_little_endian);
5720 CODING_UTF_16_SURROGATE (coding) = 0;
5721 coding->detector = detect_coding_utf_16;
5722 coding->decoder = decode_coding_utf_16;
5723 coding->encoder = encode_coding_utf_16;
5724 coding->common_flags
5725 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5726 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5727 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5729 else if (EQ (coding_type, Qccl))
5731 coding->detector = detect_coding_ccl;
5732 coding->decoder = decode_coding_ccl;
5733 coding->encoder = encode_coding_ccl;
5734 coding->common_flags
5735 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5736 | CODING_REQUIRE_FLUSHING_MASK);
5738 else if (EQ (coding_type, Qemacs_mule))
5740 coding->detector = detect_coding_emacs_mule;
5741 coding->decoder = decode_coding_emacs_mule;
5742 coding->encoder = encode_coding_emacs_mule;
5743 coding->common_flags
5744 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5745 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5746 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5748 Lisp_Object tail, safe_charsets;
5749 int max_charset_id = 0;
5751 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5752 tail = XCDR (tail))
5753 if (max_charset_id < XFASTINT (XCAR (tail)))
5754 max_charset_id = XFASTINT (XCAR (tail));
5755 safe_charsets = make_uninit_string (max_charset_id + 1);
5756 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5757 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5758 tail = XCDR (tail))
5759 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5760 coding->max_charset_id = max_charset_id;
5761 coding->safe_charsets = SDATA (safe_charsets);
5763 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5764 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5766 else if (EQ (coding_type, Qshift_jis))
5768 coding->detector = detect_coding_sjis;
5769 coding->decoder = decode_coding_sjis;
5770 coding->encoder = encode_coding_sjis;
5771 coding->common_flags
5772 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5774 else if (EQ (coding_type, Qbig5))
5776 coding->detector = detect_coding_big5;
5777 coding->decoder = decode_coding_big5;
5778 coding->encoder = encode_coding_big5;
5779 coding->common_flags
5780 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5782 else /* EQ (coding_type, Qraw_text) */
5784 coding->detector = NULL;
5785 coding->decoder = decode_coding_raw_text;
5786 coding->encoder = encode_coding_raw_text;
5787 if (! EQ (eol_type, Qunix))
5789 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5790 if (! VECTORP (eol_type))
5791 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5796 return;
5799 /* Return a list of charsets supported by CODING. */
5801 Lisp_Object
5802 coding_charset_list (struct coding_system *coding)
5804 Lisp_Object attrs, charset_list;
5806 CODING_GET_INFO (coding, attrs, charset_list);
5807 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5809 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5811 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5812 charset_list = Viso_2022_charset_list;
5814 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5816 charset_list = Vemacs_mule_charset_list;
5818 return charset_list;
5822 /* Return a list of charsets supported by CODING-SYSTEM. */
5824 Lisp_Object
5825 coding_system_charset_list (Lisp_Object coding_system)
5827 ptrdiff_t id;
5828 Lisp_Object attrs, charset_list;
5830 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5831 attrs = CODING_ID_ATTRS (id);
5833 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5835 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5837 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5838 charset_list = Viso_2022_charset_list;
5839 else
5840 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5842 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5844 charset_list = Vemacs_mule_charset_list;
5846 else
5848 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5850 return charset_list;
5854 /* Return raw-text or one of its subsidiaries that has the same
5855 eol_type as CODING-SYSTEM. */
5857 Lisp_Object
5858 raw_text_coding_system (Lisp_Object coding_system)
5860 Lisp_Object spec, attrs;
5861 Lisp_Object eol_type, raw_text_eol_type;
5863 if (NILP (coding_system))
5864 return Qraw_text;
5865 spec = CODING_SYSTEM_SPEC (coding_system);
5866 attrs = AREF (spec, 0);
5868 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5869 return coding_system;
5871 eol_type = AREF (spec, 2);
5872 if (VECTORP (eol_type))
5873 return Qraw_text;
5874 spec = CODING_SYSTEM_SPEC (Qraw_text);
5875 raw_text_eol_type = AREF (spec, 2);
5876 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5877 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5878 : AREF (raw_text_eol_type, 2));
5882 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5883 the subsidiary that has the same eol-spec as PARENT (if it is not
5884 nil and specifies end-of-line format) or the system's setting
5885 (system_eol_type). */
5887 Lisp_Object
5888 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5890 Lisp_Object spec, eol_type;
5892 if (NILP (coding_system))
5893 coding_system = Qraw_text;
5894 spec = CODING_SYSTEM_SPEC (coding_system);
5895 eol_type = AREF (spec, 2);
5896 if (VECTORP (eol_type))
5898 Lisp_Object parent_eol_type;
5900 if (! NILP (parent))
5902 Lisp_Object parent_spec;
5904 parent_spec = CODING_SYSTEM_SPEC (parent);
5905 parent_eol_type = AREF (parent_spec, 2);
5906 if (VECTORP (parent_eol_type))
5907 parent_eol_type = system_eol_type;
5909 else
5910 parent_eol_type = system_eol_type;
5911 if (EQ (parent_eol_type, Qunix))
5912 coding_system = AREF (eol_type, 0);
5913 else if (EQ (parent_eol_type, Qdos))
5914 coding_system = AREF (eol_type, 1);
5915 else if (EQ (parent_eol_type, Qmac))
5916 coding_system = AREF (eol_type, 2);
5918 return coding_system;
5922 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5923 decided for writing to a process. If not, complement them, and
5924 return a new coding system. */
5926 Lisp_Object
5927 complement_process_encoding_system (Lisp_Object coding_system)
5929 Lisp_Object coding_base = Qnil, eol_base = Qnil;
5930 Lisp_Object spec, attrs;
5931 int i;
5933 for (i = 0; i < 3; i++)
5935 if (i == 1)
5936 coding_system = CDR_SAFE (Vdefault_process_coding_system);
5937 else if (i == 2)
5938 coding_system = preferred_coding_system ();
5939 spec = CODING_SYSTEM_SPEC (coding_system);
5940 if (NILP (spec))
5941 continue;
5942 attrs = AREF (spec, 0);
5943 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5944 coding_base = CODING_ATTR_BASE_NAME (attrs);
5945 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5946 eol_base = coding_system;
5947 if (! NILP (coding_base) && ! NILP (eol_base))
5948 break;
5951 if (i > 0)
5952 /* The original CODING_SYSTEM didn't specify text-conversion or
5953 eol-conversion. Be sure that we return a fully complemented
5954 coding system. */
5955 coding_system = coding_inherit_eol_type (coding_base, eol_base);
5956 return coding_system;
5960 /* Emacs has a mechanism to automatically detect a coding system if it
5961 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5962 it's impossible to distinguish some coding systems accurately
5963 because they use the same range of codes. So, at first, coding
5964 systems are categorized into 7, those are:
5966 o coding-category-emacs-mule
5968 The category for a coding system which has the same code range
5969 as Emacs' internal format. Assigned the coding-system (Lisp
5970 symbol) `emacs-mule' by default.
5972 o coding-category-sjis
5974 The category for a coding system which has the same code range
5975 as SJIS. Assigned the coding-system (Lisp
5976 symbol) `japanese-shift-jis' by default.
5978 o coding-category-iso-7
5980 The category for a coding system which has the same code range
5981 as ISO2022 of 7-bit environment. This doesn't use any locking
5982 shift and single shift functions. This can encode/decode all
5983 charsets. Assigned the coding-system (Lisp symbol)
5984 `iso-2022-7bit' by default.
5986 o coding-category-iso-7-tight
5988 Same as coding-category-iso-7 except that this can
5989 encode/decode only the specified charsets.
5991 o coding-category-iso-8-1
5993 The category for a coding system which has the same code range
5994 as ISO2022 of 8-bit environment and graphic plane 1 used only
5995 for DIMENSION1 charset. This doesn't use any locking shift
5996 and single shift functions. Assigned the coding-system (Lisp
5997 symbol) `iso-latin-1' by default.
5999 o coding-category-iso-8-2
6001 The category for a coding system which has the same code range
6002 as ISO2022 of 8-bit environment and graphic plane 1 used only
6003 for DIMENSION2 charset. This doesn't use any locking shift
6004 and single shift functions. Assigned the coding-system (Lisp
6005 symbol) `japanese-iso-8bit' by default.
6007 o coding-category-iso-7-else
6009 The category for a coding system which has the same code range
6010 as ISO2022 of 7-bit environment but uses locking shift or
6011 single shift functions. Assigned the coding-system (Lisp
6012 symbol) `iso-2022-7bit-lock' by default.
6014 o coding-category-iso-8-else
6016 The category for a coding system which has the same code range
6017 as ISO2022 of 8-bit environment but uses locking shift or
6018 single shift functions. Assigned the coding-system (Lisp
6019 symbol) `iso-2022-8bit-ss2' by default.
6021 o coding-category-big5
6023 The category for a coding system which has the same code range
6024 as BIG5. Assigned the coding-system (Lisp symbol)
6025 `cn-big5' by default.
6027 o coding-category-utf-8
6029 The category for a coding system which has the same code range
6030 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6031 symbol) `utf-8' by default.
6033 o coding-category-utf-16-be
6035 The category for a coding system in which a text has an
6036 Unicode signature (cf. Unicode Standard) in the order of BIG
6037 endian at the head. Assigned the coding-system (Lisp symbol)
6038 `utf-16-be' by default.
6040 o coding-category-utf-16-le
6042 The category for a coding system in which a text has an
6043 Unicode signature (cf. Unicode Standard) in the order of
6044 LITTLE endian at the head. Assigned the coding-system (Lisp
6045 symbol) `utf-16-le' by default.
6047 o coding-category-ccl
6049 The category for a coding system of which encoder/decoder is
6050 written in CCL programs. The default value is nil, i.e., no
6051 coding system is assigned.
6053 o coding-category-binary
6055 The category for a coding system not categorized in any of the
6056 above. Assigned the coding-system (Lisp symbol)
6057 `no-conversion' by default.
6059 Each of them is a Lisp symbol and the value is an actual
6060 `coding-system's (this is also a Lisp symbol) assigned by a user.
6061 What Emacs does actually is to detect a category of coding system.
6062 Then, it uses a `coding-system' assigned to it. If Emacs can't
6063 decide only one possible category, it selects a category of the
6064 highest priority. Priorities of categories are also specified by a
6065 user in a Lisp variable `coding-category-list'.
6069 #define EOL_SEEN_NONE 0
6070 #define EOL_SEEN_LF 1
6071 #define EOL_SEEN_CR 2
6072 #define EOL_SEEN_CRLF 4
6075 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6076 int eol_seen);
6079 /* Return the number of ASCII characters at the head of the source.
6080 By side effects, set coding->head_ascii and coding->eol_seen. The
6081 value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
6082 EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
6083 all the source bytes are ASCII. */
6085 static int
6086 check_ascii (struct coding_system *coding)
6088 const unsigned char *src, *end;
6089 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6090 int eol_seen;
6092 eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
6093 : EQ (eol_type, Qunix) ? EOL_SEEN_LF
6094 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6095 : EOL_SEEN_CR);
6096 coding_set_source (coding);
6097 src = coding->source;
6098 end = src + coding->src_bytes;
6100 if (inhibit_eol_conversion
6101 || eol_seen != EOL_SEEN_NONE)
6103 /* We don't have to check EOL format. */
6104 while (src < end && !( *src & 0x80)) src++;
6105 if (inhibit_eol_conversion)
6107 eol_seen = EOL_SEEN_LF;
6108 adjust_coding_eol_type (coding, eol_seen);
6111 else
6113 end--; /* We look ahead one byte for "CR LF". */
6114 while (src < end)
6116 int c = *src;
6118 if (c & 0x80)
6119 break;
6120 src++;
6121 if (c == '\r')
6123 if (*src == '\n')
6125 eol_seen |= EOL_SEEN_CRLF;
6126 src++;
6128 else
6129 eol_seen |= EOL_SEEN_CR;
6131 else if (c == '\n')
6132 eol_seen |= EOL_SEEN_LF;
6134 if (src == end)
6136 int c = *src;
6138 /* All bytes but the last one C are ASCII. */
6139 if (! (c & 0x80))
6141 if (c == '\r')
6142 eol_seen |= EOL_SEEN_CR;
6143 else if (c == '\n')
6144 eol_seen |= EOL_SEEN_LF;
6145 src++;
6149 coding->head_ascii = src - coding->source;
6150 coding->eol_seen = eol_seen;
6151 return (coding->head_ascii);
6155 /* Return the number of characters at the source if all the bytes are
6156 valid UTF-8 (of Unicode range). Otherwise, return -1. By side
6157 effects, update coding->eol_seen. The value of coding->eol_seen is
6158 "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6159 the value is reliable only when all the source bytes are valid
6160 UTF-8. */
6162 static int
6163 check_utf_8 (struct coding_system *coding)
6165 const unsigned char *src, *end;
6166 int eol_seen = coding->eol_seen;
6167 int nchars = coding->head_ascii;
6169 if (coding->head_ascii < 0)
6170 check_ascii (coding);
6171 else
6172 coding_set_source (coding);
6173 src = coding->source + coding->head_ascii;
6174 /* We look ahead one byte for CR LF. */
6175 end = coding->source + coding->src_bytes - 1;
6177 while (src < end)
6179 int c = *src;
6181 if (UTF_8_1_OCTET_P (*src))
6183 src++;
6184 if (c < 0x20)
6186 if (c == '\r')
6188 if (*src == '\n')
6190 eol_seen |= EOL_SEEN_CRLF;
6191 src++;
6192 nchars++;
6194 else
6195 eol_seen |= EOL_SEEN_CR;
6197 else if (c == '\n')
6198 eol_seen |= EOL_SEEN_LF;
6201 else if (UTF_8_2_OCTET_LEADING_P (c))
6203 if (c < 0xC2 /* overlong sequence */
6204 || src + 1 >= end
6205 || ! UTF_8_EXTRA_OCTET_P (src[1]))
6206 return -1;
6207 src += 2;
6209 else if (UTF_8_3_OCTET_LEADING_P (c))
6211 if (src + 2 >= end
6212 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6213 && UTF_8_EXTRA_OCTET_P (src[2])))
6214 return -1;
6215 c = (((c & 0xF) << 12)
6216 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6217 if (c < 0x800 /* overlong sequence */
6218 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6219 return -1;
6220 src += 3;
6222 else if (UTF_8_4_OCTET_LEADING_P (c))
6224 if (src + 3 >= end
6225 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6226 && UTF_8_EXTRA_OCTET_P (src[2])
6227 && UTF_8_EXTRA_OCTET_P (src[3])))
6228 return -1;
6229 c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6230 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6231 if (c < 0x10000 /* overlong sequence */
6232 || c >= 0x110000) /* non-Unicode character */
6233 return -1;
6234 src += 4;
6236 else
6237 return -1;
6238 nchars++;
6241 if (src == end)
6243 if (! UTF_8_1_OCTET_P (*src))
6244 return -1;
6245 nchars++;
6246 if (*src == '\r')
6247 eol_seen |= EOL_SEEN_CR;
6248 else if (*src == '\n')
6249 eol_seen |= EOL_SEEN_LF;
6251 coding->eol_seen = eol_seen;
6252 return nchars;
6256 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6257 SOURCE is encoded. If CATEGORY is one of
6258 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6259 two-byte, else they are encoded by one-byte.
6261 Return one of EOL_SEEN_XXX. */
6263 #define MAX_EOL_CHECK_COUNT 3
6265 static int
6266 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6267 enum coding_category category)
6269 const unsigned char *src = source, *src_end = src + src_bytes;
6270 unsigned char c;
6271 int total = 0;
6272 int eol_seen = EOL_SEEN_NONE;
6274 if ((1 << category) & CATEGORY_MASK_UTF_16)
6276 bool msb = category == (coding_category_utf_16_le
6277 | coding_category_utf_16_le_nosig);
6278 bool lsb = !msb;
6280 while (src + 1 < src_end)
6282 c = src[lsb];
6283 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6285 int this_eol;
6287 if (c == '\n')
6288 this_eol = EOL_SEEN_LF;
6289 else if (src + 3 >= src_end
6290 || src[msb + 2] != 0
6291 || src[lsb + 2] != '\n')
6292 this_eol = EOL_SEEN_CR;
6293 else
6295 this_eol = EOL_SEEN_CRLF;
6296 src += 2;
6299 if (eol_seen == EOL_SEEN_NONE)
6300 /* This is the first end-of-line. */
6301 eol_seen = this_eol;
6302 else if (eol_seen != this_eol)
6304 /* The found type is different from what found before.
6305 Allow for stray ^M characters in DOS EOL files. */
6306 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6307 || (eol_seen == EOL_SEEN_CRLF
6308 && this_eol == EOL_SEEN_CR))
6309 eol_seen = EOL_SEEN_CRLF;
6310 else
6312 eol_seen = EOL_SEEN_LF;
6313 break;
6316 if (++total == MAX_EOL_CHECK_COUNT)
6317 break;
6319 src += 2;
6322 else
6323 while (src < src_end)
6325 c = *src++;
6326 if (c == '\n' || c == '\r')
6328 int this_eol;
6330 if (c == '\n')
6331 this_eol = EOL_SEEN_LF;
6332 else if (src >= src_end || *src != '\n')
6333 this_eol = EOL_SEEN_CR;
6334 else
6335 this_eol = EOL_SEEN_CRLF, src++;
6337 if (eol_seen == EOL_SEEN_NONE)
6338 /* This is the first end-of-line. */
6339 eol_seen = this_eol;
6340 else if (eol_seen != this_eol)
6342 /* The found type is different from what found before.
6343 Allow for stray ^M characters in DOS EOL files. */
6344 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6345 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6346 eol_seen = EOL_SEEN_CRLF;
6347 else
6349 eol_seen = EOL_SEEN_LF;
6350 break;
6353 if (++total == MAX_EOL_CHECK_COUNT)
6354 break;
6357 return eol_seen;
6361 static Lisp_Object
6362 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6364 Lisp_Object eol_type;
6366 eol_type = CODING_ID_EOL_TYPE (coding->id);
6367 if (! VECTORP (eol_type))
6368 /* Already adjusted. */
6369 return eol_type;
6370 if (eol_seen & EOL_SEEN_LF)
6372 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6373 eol_type = Qunix;
6375 else if (eol_seen & EOL_SEEN_CRLF)
6377 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6378 eol_type = Qdos;
6380 else if (eol_seen & EOL_SEEN_CR)
6382 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6383 eol_type = Qmac;
6385 return eol_type;
6388 /* Detect how a text specified in CODING is encoded. If a coding
6389 system is detected, update fields of CODING by the detected coding
6390 system. */
6392 static void
6393 detect_coding (struct coding_system *coding)
6395 const unsigned char *src, *src_end;
6396 unsigned int saved_mode = coding->mode;
6398 coding->consumed = coding->consumed_char = 0;
6399 coding->produced = coding->produced_char = 0;
6400 coding_set_source (coding);
6402 src_end = coding->source + coding->src_bytes;
6404 /* If we have not yet decided the text encoding type, detect it
6405 now. */
6406 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6408 int c, i;
6409 struct coding_detection_info detect_info;
6410 bool null_byte_found = 0, eight_bit_found = 0;
6412 coding->head_ascii = 0;
6413 coding->eol_seen = EOL_SEEN_NONE;
6414 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6415 for (src = coding->source; src < src_end; src++)
6417 c = *src;
6418 if (c & 0x80)
6420 eight_bit_found = 1;
6421 if (null_byte_found)
6422 break;
6424 else if (c < 0x20)
6426 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6427 && ! inhibit_iso_escape_detection
6428 && ! detect_info.checked)
6430 if (detect_coding_iso_2022 (coding, &detect_info))
6432 /* We have scanned the whole data. */
6433 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6435 /* We didn't find an 8-bit code. We may
6436 have found a null-byte, but it's very
6437 rare that a binary file conforms to
6438 ISO-2022. */
6439 src = src_end;
6440 coding->head_ascii = src - coding->source;
6442 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6443 break;
6446 else if (! c && !inhibit_null_byte_detection)
6448 null_byte_found = 1;
6449 if (eight_bit_found)
6450 break;
6452 else if (! disable_ascii_optimization
6453 && ! inhibit_eol_conversion)
6455 if (c == '\r')
6457 if (src < src_end && src[1] == '\n')
6459 coding->eol_seen |= EOL_SEEN_CRLF;
6460 src++;
6461 if (! eight_bit_found)
6462 coding->head_ascii++;
6464 else
6465 coding->eol_seen |= EOL_SEEN_CR;
6467 else if (c == '\n')
6469 coding->eol_seen |= EOL_SEEN_LF;
6473 if (! eight_bit_found)
6474 coding->head_ascii++;
6476 else if (! eight_bit_found)
6477 coding->head_ascii++;
6480 if (null_byte_found || eight_bit_found
6481 || coding->head_ascii < coding->src_bytes
6482 || detect_info.found)
6484 enum coding_category category;
6485 struct coding_system *this;
6487 if (coding->head_ascii == coding->src_bytes)
6488 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6489 for (i = 0; i < coding_category_raw_text; i++)
6491 category = coding_priorities[i];
6492 this = coding_categories + category;
6493 if (detect_info.found & (1 << category))
6494 break;
6496 else
6498 if (null_byte_found)
6500 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6501 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6503 for (i = 0; i < coding_category_raw_text; i++)
6505 category = coding_priorities[i];
6506 this = coding_categories + category;
6507 /* Some of this->detector (e.g. detect_coding_sjis)
6508 require this information. */
6509 coding->id = this->id;
6510 if (this->id < 0)
6512 /* No coding system of this category is defined. */
6513 detect_info.rejected |= (1 << category);
6515 else if (category >= coding_category_raw_text)
6516 continue;
6517 else if (detect_info.checked & (1 << category))
6519 if (detect_info.found & (1 << category))
6520 break;
6522 else if ((*(this->detector)) (coding, &detect_info)
6523 && detect_info.found & (1 << category))
6525 if (category == coding_category_utf_16_auto)
6527 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6528 category = coding_category_utf_16_le;
6529 else
6530 category = coding_category_utf_16_be;
6532 break;
6537 if (i < coding_category_raw_text)
6538 setup_coding_system (CODING_ID_NAME (this->id), coding);
6539 else if (null_byte_found)
6540 setup_coding_system (Qno_conversion, coding);
6541 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6542 == CATEGORY_MASK_ANY)
6543 setup_coding_system (Qraw_text, coding);
6544 else if (detect_info.rejected)
6545 for (i = 0; i < coding_category_raw_text; i++)
6546 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6548 this = coding_categories + coding_priorities[i];
6549 setup_coding_system (CODING_ID_NAME (this->id), coding);
6550 break;
6554 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6555 == coding_category_utf_8_auto)
6557 Lisp_Object coding_systems;
6558 struct coding_detection_info detect_info;
6560 coding_systems
6561 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6562 detect_info.found = detect_info.rejected = 0;
6563 if (check_ascii (coding) == coding->src_bytes)
6565 int head_ascii = coding->head_ascii;
6567 if (coding->eol_seen != EOL_SEEN_NONE)
6568 adjust_coding_eol_type (coding, coding->eol_seen);
6569 setup_coding_system (XCDR (coding_systems), coding);
6570 coding->head_ascii = head_ascii;
6572 else
6574 if (CONSP (coding_systems)
6575 && detect_coding_utf_8 (coding, &detect_info))
6577 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6578 setup_coding_system (XCAR (coding_systems), coding);
6579 else
6580 setup_coding_system (XCDR (coding_systems), coding);
6584 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6585 == coding_category_utf_16_auto)
6587 Lisp_Object coding_systems;
6588 struct coding_detection_info detect_info;
6590 coding_systems
6591 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6592 detect_info.found = detect_info.rejected = 0;
6593 coding->head_ascii = 0;
6594 coding->eol_seen = EOL_SEEN_NONE;
6595 if (CONSP (coding_systems)
6596 && detect_coding_utf_16 (coding, &detect_info))
6598 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6599 setup_coding_system (XCAR (coding_systems), coding);
6600 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6601 setup_coding_system (XCDR (coding_systems), coding);
6604 coding->mode = saved_mode;
6608 static void
6609 decode_eol (struct coding_system *coding)
6611 Lisp_Object eol_type;
6612 unsigned char *p, *pbeg, *pend;
6614 eol_type = CODING_ID_EOL_TYPE (coding->id);
6615 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6616 return;
6618 if (NILP (coding->dst_object))
6619 pbeg = coding->destination;
6620 else
6621 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6622 pend = pbeg + coding->produced;
6624 if (VECTORP (eol_type))
6626 int eol_seen = EOL_SEEN_NONE;
6628 for (p = pbeg; p < pend; p++)
6630 if (*p == '\n')
6631 eol_seen |= EOL_SEEN_LF;
6632 else if (*p == '\r')
6634 if (p + 1 < pend && *(p + 1) == '\n')
6636 eol_seen |= EOL_SEEN_CRLF;
6637 p++;
6639 else
6640 eol_seen |= EOL_SEEN_CR;
6643 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6644 if ((eol_seen & EOL_SEEN_CRLF) != 0
6645 && (eol_seen & EOL_SEEN_CR) != 0
6646 && (eol_seen & EOL_SEEN_LF) == 0)
6647 eol_seen = EOL_SEEN_CRLF;
6648 else if (eol_seen != EOL_SEEN_NONE
6649 && eol_seen != EOL_SEEN_LF
6650 && eol_seen != EOL_SEEN_CRLF
6651 && eol_seen != EOL_SEEN_CR)
6652 eol_seen = EOL_SEEN_LF;
6653 if (eol_seen != EOL_SEEN_NONE)
6654 eol_type = adjust_coding_eol_type (coding, eol_seen);
6657 if (EQ (eol_type, Qmac))
6659 for (p = pbeg; p < pend; p++)
6660 if (*p == '\r')
6661 *p = '\n';
6663 else if (EQ (eol_type, Qdos))
6665 ptrdiff_t n = 0;
6667 if (NILP (coding->dst_object))
6669 /* Start deleting '\r' from the tail to minimize the memory
6670 movement. */
6671 for (p = pend - 2; p >= pbeg; p--)
6672 if (*p == '\r')
6674 memmove (p, p + 1, pend-- - p - 1);
6675 n++;
6678 else
6680 ptrdiff_t pos_byte = coding->dst_pos_byte;
6681 ptrdiff_t pos = coding->dst_pos;
6682 ptrdiff_t pos_end = pos + coding->produced_char - 1;
6684 while (pos < pos_end)
6686 p = BYTE_POS_ADDR (pos_byte);
6687 if (*p == '\r' && p[1] == '\n')
6689 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6690 n++;
6691 pos_end--;
6693 pos++;
6694 if (coding->dst_multibyte)
6695 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6696 else
6697 pos_byte++;
6700 coding->produced -= n;
6701 coding->produced_char -= n;
6706 /* Return a translation table (or list of them) from coding system
6707 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6708 not ENCODEP). */
6710 static Lisp_Object
6711 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6713 Lisp_Object standard, translation_table;
6714 Lisp_Object val;
6716 if (NILP (Venable_character_translation))
6718 if (max_lookup)
6719 *max_lookup = 0;
6720 return Qnil;
6722 if (encodep)
6723 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6724 standard = Vstandard_translation_table_for_encode;
6725 else
6726 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6727 standard = Vstandard_translation_table_for_decode;
6728 if (NILP (translation_table))
6729 translation_table = standard;
6730 else
6732 if (SYMBOLP (translation_table))
6733 translation_table = Fget (translation_table, Qtranslation_table);
6734 else if (CONSP (translation_table))
6736 translation_table = Fcopy_sequence (translation_table);
6737 for (val = translation_table; CONSP (val); val = XCDR (val))
6738 if (SYMBOLP (XCAR (val)))
6739 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6741 if (CHAR_TABLE_P (standard))
6743 if (CONSP (translation_table))
6744 translation_table = nconc2 (translation_table,
6745 Fcons (standard, Qnil));
6746 else
6747 translation_table = Fcons (translation_table,
6748 Fcons (standard, Qnil));
6752 if (max_lookup)
6754 *max_lookup = 1;
6755 if (CHAR_TABLE_P (translation_table)
6756 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6758 val = XCHAR_TABLE (translation_table)->extras[1];
6759 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6760 *max_lookup = XFASTINT (val);
6762 else if (CONSP (translation_table))
6764 Lisp_Object tail;
6766 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6767 if (CHAR_TABLE_P (XCAR (tail))
6768 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6770 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6771 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6772 *max_lookup = XFASTINT (tailval);
6776 return translation_table;
6779 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6780 do { \
6781 trans = Qnil; \
6782 if (CHAR_TABLE_P (table)) \
6784 trans = CHAR_TABLE_REF (table, c); \
6785 if (CHARACTERP (trans)) \
6786 c = XFASTINT (trans), trans = Qnil; \
6788 else if (CONSP (table)) \
6790 Lisp_Object tail; \
6792 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6793 if (CHAR_TABLE_P (XCAR (tail))) \
6795 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6796 if (CHARACTERP (trans)) \
6797 c = XFASTINT (trans), trans = Qnil; \
6798 else if (! NILP (trans)) \
6799 break; \
6802 } while (0)
6805 /* Return a translation of character(s) at BUF according to TRANS.
6806 TRANS is TO-CHAR or ((FROM . TO) ...) where
6807 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6808 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6809 translation is found, and Qnil if not found..
6810 If BUF is too short to lookup characters in FROM, return Qt. */
6812 static Lisp_Object
6813 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6816 if (INTEGERP (trans))
6817 return trans;
6818 for (; CONSP (trans); trans = XCDR (trans))
6820 Lisp_Object val = XCAR (trans);
6821 Lisp_Object from = XCAR (val);
6822 ptrdiff_t len = ASIZE (from);
6823 ptrdiff_t i;
6825 for (i = 0; i < len; i++)
6827 if (buf + i == buf_end)
6828 return Qt;
6829 if (XINT (AREF (from, i)) != buf[i])
6830 break;
6832 if (i == len)
6833 return val;
6835 return Qnil;
6839 static int
6840 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6841 bool last_block)
6843 unsigned char *dst = coding->destination + coding->produced;
6844 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6845 ptrdiff_t produced;
6846 ptrdiff_t produced_chars = 0;
6847 int carryover = 0;
6849 if (! coding->chars_at_source)
6851 /* Source characters are in coding->charbuf. */
6852 int *buf = coding->charbuf;
6853 int *buf_end = buf + coding->charbuf_used;
6855 if (EQ (coding->src_object, coding->dst_object))
6857 coding_set_source (coding);
6858 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6861 while (buf < buf_end)
6863 int c = *buf;
6864 ptrdiff_t i;
6866 if (c >= 0)
6868 ptrdiff_t from_nchars = 1, to_nchars = 1;
6869 Lisp_Object trans = Qnil;
6871 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6872 if (! NILP (trans))
6874 trans = get_translation (trans, buf, buf_end);
6875 if (INTEGERP (trans))
6876 c = XINT (trans);
6877 else if (CONSP (trans))
6879 from_nchars = ASIZE (XCAR (trans));
6880 trans = XCDR (trans);
6881 if (INTEGERP (trans))
6882 c = XINT (trans);
6883 else
6885 to_nchars = ASIZE (trans);
6886 c = XINT (AREF (trans, 0));
6889 else if (EQ (trans, Qt) && ! last_block)
6890 break;
6893 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6895 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6896 / MAX_MULTIBYTE_LENGTH)
6897 < to_nchars)
6898 memory_full (SIZE_MAX);
6899 dst = alloc_destination (coding,
6900 buf_end - buf
6901 + MAX_MULTIBYTE_LENGTH * to_nchars,
6902 dst);
6903 if (EQ (coding->src_object, coding->dst_object))
6905 coding_set_source (coding);
6906 dst_end = (((unsigned char *) coding->source)
6907 + coding->consumed);
6909 else
6910 dst_end = coding->destination + coding->dst_bytes;
6913 for (i = 0; i < to_nchars; i++)
6915 if (i > 0)
6916 c = XINT (AREF (trans, i));
6917 if (coding->dst_multibyte
6918 || ! CHAR_BYTE8_P (c))
6919 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6920 else
6921 *dst++ = CHAR_TO_BYTE8 (c);
6923 produced_chars += to_nchars;
6924 buf += from_nchars;
6926 else
6927 /* This is an annotation datum. (-C) is the length. */
6928 buf += -c;
6930 carryover = buf_end - buf;
6932 else
6934 /* Source characters are at coding->source. */
6935 const unsigned char *src = coding->source;
6936 const unsigned char *src_end = src + coding->consumed;
6938 if (EQ (coding->dst_object, coding->src_object))
6939 dst_end = (unsigned char *) src;
6940 if (coding->src_multibyte != coding->dst_multibyte)
6942 if (coding->src_multibyte)
6944 bool multibytep = 1;
6945 ptrdiff_t consumed_chars = 0;
6947 while (1)
6949 const unsigned char *src_base = src;
6950 int c;
6952 ONE_MORE_BYTE (c);
6953 if (dst == dst_end)
6955 if (EQ (coding->src_object, coding->dst_object))
6956 dst_end = (unsigned char *) src;
6957 if (dst == dst_end)
6959 ptrdiff_t offset = src - coding->source;
6961 dst = alloc_destination (coding, src_end - src + 1,
6962 dst);
6963 dst_end = coding->destination + coding->dst_bytes;
6964 coding_set_source (coding);
6965 src = coding->source + offset;
6966 src_end = coding->source + coding->consumed;
6967 if (EQ (coding->src_object, coding->dst_object))
6968 dst_end = (unsigned char *) src;
6971 *dst++ = c;
6972 produced_chars++;
6974 no_more_source:
6977 else
6978 while (src < src_end)
6980 bool multibytep = 1;
6981 int c = *src++;
6983 if (dst >= dst_end - 1)
6985 if (EQ (coding->src_object, coding->dst_object))
6986 dst_end = (unsigned char *) src;
6987 if (dst >= dst_end - 1)
6989 ptrdiff_t offset = src - coding->source;
6990 ptrdiff_t more_bytes;
6992 if (EQ (coding->src_object, coding->dst_object))
6993 more_bytes = ((src_end - src) / 2) + 2;
6994 else
6995 more_bytes = src_end - src + 2;
6996 dst = alloc_destination (coding, more_bytes, dst);
6997 dst_end = coding->destination + coding->dst_bytes;
6998 coding_set_source (coding);
6999 src = coding->source + offset;
7000 src_end = coding->source + coding->consumed;
7001 if (EQ (coding->src_object, coding->dst_object))
7002 dst_end = (unsigned char *) src;
7005 EMIT_ONE_BYTE (c);
7008 else
7010 if (!EQ (coding->src_object, coding->dst_object))
7012 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7014 if (require > 0)
7016 ptrdiff_t offset = src - coding->source;
7018 dst = alloc_destination (coding, require, dst);
7019 coding_set_source (coding);
7020 src = coding->source + offset;
7021 src_end = coding->source + coding->consumed;
7024 produced_chars = coding->consumed_char;
7025 while (src < src_end)
7026 *dst++ = *src++;
7030 produced = dst - (coding->destination + coding->produced);
7031 if (BUFFERP (coding->dst_object) && produced_chars > 0)
7032 insert_from_gap (produced_chars, produced, 0);
7033 coding->produced += produced;
7034 coding->produced_char += produced_chars;
7035 return carryover;
7038 /* Compose text in CODING->object according to the annotation data at
7039 CHARBUF. CHARBUF is an array:
7040 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7043 static void
7044 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7046 int len;
7047 ptrdiff_t to;
7048 enum composition_method method;
7049 Lisp_Object components;
7051 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7052 to = pos + charbuf[2];
7053 method = (enum composition_method) (charbuf[4]);
7055 if (method == COMPOSITION_RELATIVE)
7056 components = Qnil;
7057 else
7059 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7060 int i, j;
7062 if (method == COMPOSITION_WITH_RULE)
7063 len = charbuf[2] * 3 - 2;
7064 charbuf += MAX_ANNOTATION_LENGTH;
7065 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7066 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7068 if (charbuf[i] >= 0)
7069 args[j] = make_number (charbuf[i]);
7070 else
7072 i++;
7073 args[j] = make_number (charbuf[i] % 0x100);
7076 components = (i == j ? Fstring (j, args) : Fvector (j, args));
7078 compose_text (pos, to, components, Qnil, coding->dst_object);
7082 /* Put `charset' property on text in CODING->object according to
7083 the annotation data at CHARBUF. CHARBUF is an array:
7084 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7087 static void
7088 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7090 ptrdiff_t from = pos - charbuf[2];
7091 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7093 Fput_text_property (make_number (from), make_number (pos),
7094 Qcharset, CHARSET_NAME (charset),
7095 coding->dst_object);
7099 #define CHARBUF_SIZE 0x4000
7101 #define ALLOC_CONVERSION_WORK_AREA(coding) \
7102 do { \
7103 coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int)); \
7104 coding->charbuf_size = CHARBUF_SIZE; \
7105 } while (0)
7108 static void
7109 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7111 int *charbuf = coding->charbuf;
7112 int *charbuf_end = charbuf + coding->charbuf_used;
7114 if (NILP (coding->dst_object))
7115 return;
7117 while (charbuf < charbuf_end)
7119 if (*charbuf >= 0)
7120 pos++, charbuf++;
7121 else
7123 int len = -*charbuf;
7125 if (len > 2)
7126 switch (charbuf[1])
7128 case CODING_ANNOTATE_COMPOSITION_MASK:
7129 produce_composition (coding, charbuf, pos);
7130 break;
7131 case CODING_ANNOTATE_CHARSET_MASK:
7132 produce_charset (coding, charbuf, pos);
7133 break;
7135 charbuf += len;
7140 /* Decode the data at CODING->src_object into CODING->dst_object.
7141 CODING->src_object is a buffer, a string, or nil.
7142 CODING->dst_object is a buffer.
7144 If CODING->src_object is a buffer, it must be the current buffer.
7145 In this case, if CODING->src_pos is positive, it is a position of
7146 the source text in the buffer, otherwise, the source text is in the
7147 gap area of the buffer, and CODING->src_pos specifies the offset of
7148 the text from GPT (which must be the same as PT). If this is the
7149 same buffer as CODING->dst_object, CODING->src_pos must be
7150 negative.
7152 If CODING->src_object is a string, CODING->src_pos is an index to
7153 that string.
7155 If CODING->src_object is nil, CODING->source must already point to
7156 the non-relocatable memory area. In this case, CODING->src_pos is
7157 an offset from CODING->source.
7159 The decoded data is inserted at the current point of the buffer
7160 CODING->dst_object.
7163 static void
7164 decode_coding (struct coding_system *coding)
7166 Lisp_Object attrs;
7167 Lisp_Object undo_list;
7168 Lisp_Object translation_table;
7169 struct ccl_spec cclspec;
7170 int carryover;
7171 int i;
7173 USE_SAFE_ALLOCA;
7175 if (BUFFERP (coding->src_object)
7176 && coding->src_pos > 0
7177 && coding->src_pos < GPT
7178 && coding->src_pos + coding->src_chars > GPT)
7179 move_gap_both (coding->src_pos, coding->src_pos_byte);
7181 undo_list = Qt;
7182 if (BUFFERP (coding->dst_object))
7184 set_buffer_internal (XBUFFER (coding->dst_object));
7185 if (GPT != PT)
7186 move_gap_both (PT, PT_BYTE);
7188 /* We must disable undo_list in order to record the whole insert
7189 transaction via record_insert at the end. But doing so also
7190 disables the recording of the first change to the undo_list.
7191 Therefore we check for first change here and record it via
7192 record_first_change if needed. */
7193 if (MODIFF <= SAVE_MODIFF)
7194 record_first_change ();
7196 undo_list = BVAR (current_buffer, undo_list);
7197 bset_undo_list (current_buffer, Qt);
7200 coding->consumed = coding->consumed_char = 0;
7201 coding->produced = coding->produced_char = 0;
7202 coding->chars_at_source = 0;
7203 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7204 coding->errors = 0;
7206 ALLOC_CONVERSION_WORK_AREA (coding);
7208 attrs = CODING_ID_ATTRS (coding->id);
7209 translation_table = get_translation_table (attrs, 0, NULL);
7211 carryover = 0;
7212 if (coding->decoder == decode_coding_ccl)
7214 coding->spec.ccl = &cclspec;
7215 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7219 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7221 coding_set_source (coding);
7222 coding->annotated = 0;
7223 coding->charbuf_used = carryover;
7224 (*(coding->decoder)) (coding);
7225 coding_set_destination (coding);
7226 carryover = produce_chars (coding, translation_table, 0);
7227 if (coding->annotated)
7228 produce_annotation (coding, pos);
7229 for (i = 0; i < carryover; i++)
7230 coding->charbuf[i]
7231 = coding->charbuf[coding->charbuf_used - carryover + i];
7233 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7234 || (coding->consumed < coding->src_bytes
7235 && (coding->result == CODING_RESULT_SUCCESS
7236 || coding->result == CODING_RESULT_INVALID_SRC)));
7238 if (carryover > 0)
7240 coding_set_destination (coding);
7241 coding->charbuf_used = carryover;
7242 produce_chars (coding, translation_table, 1);
7245 coding->carryover_bytes = 0;
7246 if (coding->consumed < coding->src_bytes)
7248 int nbytes = coding->src_bytes - coding->consumed;
7249 const unsigned char *src;
7251 coding_set_source (coding);
7252 coding_set_destination (coding);
7253 src = coding->source + coding->consumed;
7255 if (coding->mode & CODING_MODE_LAST_BLOCK)
7257 /* Flush out unprocessed data as binary chars. We are sure
7258 that the number of data is less than the size of
7259 coding->charbuf. */
7260 coding->charbuf_used = 0;
7261 coding->chars_at_source = 0;
7263 while (nbytes-- > 0)
7265 int c = *src++;
7267 if (c & 0x80)
7268 c = BYTE8_TO_CHAR (c);
7269 coding->charbuf[coding->charbuf_used++] = c;
7271 produce_chars (coding, Qnil, 1);
7273 else
7275 /* Record unprocessed bytes in coding->carryover. We are
7276 sure that the number of data is less than the size of
7277 coding->carryover. */
7278 unsigned char *p = coding->carryover;
7280 if (nbytes > sizeof coding->carryover)
7281 nbytes = sizeof coding->carryover;
7282 coding->carryover_bytes = nbytes;
7283 while (nbytes-- > 0)
7284 *p++ = *src++;
7286 coding->consumed = coding->src_bytes;
7289 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7290 && !inhibit_eol_conversion)
7291 decode_eol (coding);
7292 if (BUFFERP (coding->dst_object))
7294 bset_undo_list (current_buffer, undo_list);
7295 record_insert (coding->dst_pos, coding->produced_char);
7298 SAFE_FREE ();
7302 /* Extract an annotation datum from a composition starting at POS and
7303 ending before LIMIT of CODING->src_object (buffer or string), store
7304 the data in BUF, set *STOP to a starting position of the next
7305 composition (if any) or to LIMIT, and return the address of the
7306 next element of BUF.
7308 If such an annotation is not found, set *STOP to a starting
7309 position of a composition after POS (if any) or to LIMIT, and
7310 return BUF. */
7312 static int *
7313 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7314 struct coding_system *coding, int *buf,
7315 ptrdiff_t *stop)
7317 ptrdiff_t start, end;
7318 Lisp_Object prop;
7320 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7321 || end > limit)
7322 *stop = limit;
7323 else if (start > pos)
7324 *stop = start;
7325 else
7327 if (start == pos)
7329 /* We found a composition. Store the corresponding
7330 annotation data in BUF. */
7331 int *head = buf;
7332 enum composition_method method = COMPOSITION_METHOD (prop);
7333 int nchars = COMPOSITION_LENGTH (prop);
7335 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7336 if (method != COMPOSITION_RELATIVE)
7338 Lisp_Object components;
7339 ptrdiff_t i, len, i_byte;
7341 components = COMPOSITION_COMPONENTS (prop);
7342 if (VECTORP (components))
7344 len = ASIZE (components);
7345 for (i = 0; i < len; i++)
7346 *buf++ = XINT (AREF (components, i));
7348 else if (STRINGP (components))
7350 len = SCHARS (components);
7351 i = i_byte = 0;
7352 while (i < len)
7354 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7355 buf++;
7358 else if (INTEGERP (components))
7360 len = 1;
7361 *buf++ = XINT (components);
7363 else if (CONSP (components))
7365 for (len = 0; CONSP (components);
7366 len++, components = XCDR (components))
7367 *buf++ = XINT (XCAR (components));
7369 else
7370 emacs_abort ();
7371 *head -= len;
7375 if (find_composition (end, limit, &start, &end, &prop,
7376 coding->src_object)
7377 && end <= limit)
7378 *stop = start;
7379 else
7380 *stop = limit;
7382 return buf;
7386 /* Extract an annotation datum from a text property `charset' at POS of
7387 CODING->src_object (buffer of string), store the data in BUF, set
7388 *STOP to the position where the value of `charset' property changes
7389 (limiting by LIMIT), and return the address of the next element of
7390 BUF.
7392 If the property value is nil, set *STOP to the position where the
7393 property value is non-nil (limiting by LIMIT), and return BUF. */
7395 static int *
7396 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7397 struct coding_system *coding, int *buf,
7398 ptrdiff_t *stop)
7400 Lisp_Object val, next;
7401 int id;
7403 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7404 if (! NILP (val) && CHARSETP (val))
7405 id = XINT (CHARSET_SYMBOL_ID (val));
7406 else
7407 id = -1;
7408 ADD_CHARSET_DATA (buf, 0, id);
7409 next = Fnext_single_property_change (make_number (pos), Qcharset,
7410 coding->src_object,
7411 make_number (limit));
7412 *stop = XINT (next);
7413 return buf;
7417 static void
7418 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7419 int max_lookup)
7421 int *buf = coding->charbuf;
7422 int *buf_end = coding->charbuf + coding->charbuf_size;
7423 const unsigned char *src = coding->source + coding->consumed;
7424 const unsigned char *src_end = coding->source + coding->src_bytes;
7425 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7426 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7427 bool multibytep = coding->src_multibyte;
7428 Lisp_Object eol_type;
7429 int c;
7430 ptrdiff_t stop, stop_composition, stop_charset;
7431 int *lookup_buf = NULL;
7433 if (! NILP (translation_table))
7434 lookup_buf = alloca (sizeof (int) * max_lookup);
7436 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7437 if (VECTORP (eol_type))
7438 eol_type = Qunix;
7440 /* Note: composition handling is not yet implemented. */
7441 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7443 if (NILP (coding->src_object))
7444 stop = stop_composition = stop_charset = end_pos;
7445 else
7447 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7448 stop = stop_composition = pos;
7449 else
7450 stop = stop_composition = end_pos;
7451 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7452 stop = stop_charset = pos;
7453 else
7454 stop_charset = end_pos;
7457 /* Compensate for CRLF and conversion. */
7458 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7459 while (buf < buf_end)
7461 Lisp_Object trans;
7463 if (pos == stop)
7465 if (pos == end_pos)
7466 break;
7467 if (pos == stop_composition)
7468 buf = handle_composition_annotation (pos, end_pos, coding,
7469 buf, &stop_composition);
7470 if (pos == stop_charset)
7471 buf = handle_charset_annotation (pos, end_pos, coding,
7472 buf, &stop_charset);
7473 stop = (stop_composition < stop_charset
7474 ? stop_composition : stop_charset);
7477 if (! multibytep)
7479 int bytes;
7481 if (coding->encoder == encode_coding_raw_text
7482 || coding->encoder == encode_coding_ccl)
7483 c = *src++, pos++;
7484 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7485 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7486 else
7487 c = BYTE8_TO_CHAR (*src), src++, pos++;
7489 else
7490 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7491 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7492 c = '\n';
7493 if (! EQ (eol_type, Qunix))
7495 if (c == '\n')
7497 if (EQ (eol_type, Qdos))
7498 *buf++ = '\r';
7499 else
7500 c = '\r';
7504 trans = Qnil;
7505 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7506 if (NILP (trans))
7507 *buf++ = c;
7508 else
7510 ptrdiff_t from_nchars = 1, to_nchars = 1;
7511 int *lookup_buf_end;
7512 const unsigned char *p = src;
7513 int i;
7515 lookup_buf[0] = c;
7516 for (i = 1; i < max_lookup && p < src_end; i++)
7517 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7518 lookup_buf_end = lookup_buf + i;
7519 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7520 if (INTEGERP (trans))
7521 c = XINT (trans);
7522 else if (CONSP (trans))
7524 from_nchars = ASIZE (XCAR (trans));
7525 trans = XCDR (trans);
7526 if (INTEGERP (trans))
7527 c = XINT (trans);
7528 else
7530 to_nchars = ASIZE (trans);
7531 if (buf_end - buf < to_nchars)
7532 break;
7533 c = XINT (AREF (trans, 0));
7536 else
7537 break;
7538 *buf++ = c;
7539 for (i = 1; i < to_nchars; i++)
7540 *buf++ = XINT (AREF (trans, i));
7541 for (i = 1; i < from_nchars; i++, pos++)
7542 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7546 coding->consumed = src - coding->source;
7547 coding->consumed_char = pos - coding->src_pos;
7548 coding->charbuf_used = buf - coding->charbuf;
7549 coding->chars_at_source = 0;
7553 /* Encode the text at CODING->src_object into CODING->dst_object.
7554 CODING->src_object is a buffer or a string.
7555 CODING->dst_object is a buffer or nil.
7557 If CODING->src_object is a buffer, it must be the current buffer.
7558 In this case, if CODING->src_pos is positive, it is a position of
7559 the source text in the buffer, otherwise. the source text is in the
7560 gap area of the buffer, and coding->src_pos specifies the offset of
7561 the text from GPT (which must be the same as PT). If this is the
7562 same buffer as CODING->dst_object, CODING->src_pos must be
7563 negative and CODING should not have `pre-write-conversion'.
7565 If CODING->src_object is a string, CODING should not have
7566 `pre-write-conversion'.
7568 If CODING->dst_object is a buffer, the encoded data is inserted at
7569 the current point of that buffer.
7571 If CODING->dst_object is nil, the encoded data is placed at the
7572 memory area specified by CODING->destination. */
7574 static void
7575 encode_coding (struct coding_system *coding)
7577 Lisp_Object attrs;
7578 Lisp_Object translation_table;
7579 int max_lookup;
7580 struct ccl_spec cclspec;
7582 USE_SAFE_ALLOCA;
7584 attrs = CODING_ID_ATTRS (coding->id);
7585 if (coding->encoder == encode_coding_raw_text)
7586 translation_table = Qnil, max_lookup = 0;
7587 else
7588 translation_table = get_translation_table (attrs, 1, &max_lookup);
7590 if (BUFFERP (coding->dst_object))
7592 set_buffer_internal (XBUFFER (coding->dst_object));
7593 coding->dst_multibyte
7594 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7597 coding->consumed = coding->consumed_char = 0;
7598 coding->produced = coding->produced_char = 0;
7599 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7600 coding->errors = 0;
7602 ALLOC_CONVERSION_WORK_AREA (coding);
7604 if (coding->encoder == encode_coding_ccl)
7606 coding->spec.ccl = &cclspec;
7607 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7609 do {
7610 coding_set_source (coding);
7611 consume_chars (coding, translation_table, max_lookup);
7612 coding_set_destination (coding);
7613 (*(coding->encoder)) (coding);
7614 } while (coding->consumed_char < coding->src_chars);
7616 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7617 insert_from_gap (coding->produced_char, coding->produced, 0);
7619 SAFE_FREE ();
7623 /* Name (or base name) of work buffer for code conversion. */
7624 static Lisp_Object Vcode_conversion_workbuf_name;
7626 /* A working buffer used by the top level conversion. Once it is
7627 created, it is never destroyed. It has the name
7628 Vcode_conversion_workbuf_name. The other working buffers are
7629 destroyed after the use is finished, and their names are modified
7630 versions of Vcode_conversion_workbuf_name. */
7631 static Lisp_Object Vcode_conversion_reused_workbuf;
7633 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7634 static bool reused_workbuf_in_use;
7637 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7638 multibyteness of returning buffer. */
7640 static Lisp_Object
7641 make_conversion_work_buffer (bool multibyte)
7643 Lisp_Object name, workbuf;
7644 struct buffer *current;
7646 if (reused_workbuf_in_use)
7648 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7649 workbuf = Fget_buffer_create (name);
7651 else
7653 reused_workbuf_in_use = 1;
7654 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7655 Vcode_conversion_reused_workbuf
7656 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7657 workbuf = Vcode_conversion_reused_workbuf;
7659 current = current_buffer;
7660 set_buffer_internal (XBUFFER (workbuf));
7661 /* We can't allow modification hooks to run in the work buffer. For
7662 instance, directory_files_internal assumes that file decoding
7663 doesn't compile new regexps. */
7664 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7665 Ferase_buffer ();
7666 bset_undo_list (current_buffer, Qt);
7667 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7668 set_buffer_internal (current);
7669 return workbuf;
7673 static Lisp_Object
7674 code_conversion_restore (Lisp_Object arg)
7676 Lisp_Object current, workbuf;
7677 struct gcpro gcpro1;
7679 GCPRO1 (arg);
7680 current = XCAR (arg);
7681 workbuf = XCDR (arg);
7682 if (! NILP (workbuf))
7684 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7685 reused_workbuf_in_use = 0;
7686 else
7687 Fkill_buffer (workbuf);
7689 set_buffer_internal (XBUFFER (current));
7690 UNGCPRO;
7691 return Qnil;
7694 Lisp_Object
7695 code_conversion_save (bool with_work_buf, bool multibyte)
7697 Lisp_Object workbuf = Qnil;
7699 if (with_work_buf)
7700 workbuf = make_conversion_work_buffer (multibyte);
7701 record_unwind_protect (code_conversion_restore,
7702 Fcons (Fcurrent_buffer (), workbuf));
7703 return workbuf;
7706 void
7707 decode_coding_gap (struct coding_system *coding,
7708 ptrdiff_t chars, ptrdiff_t bytes)
7710 ptrdiff_t count = SPECPDL_INDEX ();
7711 Lisp_Object attrs;
7713 coding->src_object = Fcurrent_buffer ();
7714 coding->src_chars = chars;
7715 coding->src_bytes = bytes;
7716 coding->src_pos = -chars;
7717 coding->src_pos_byte = -bytes;
7718 coding->src_multibyte = chars < bytes;
7719 coding->dst_object = coding->src_object;
7720 coding->dst_pos = PT;
7721 coding->dst_pos_byte = PT_BYTE;
7722 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7724 if (CODING_REQUIRE_DETECTION (coding))
7725 detect_coding (coding);
7726 attrs = CODING_ID_ATTRS (coding->id);
7727 if (! disable_ascii_optimization
7728 && ! coding->src_multibyte
7729 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7730 && NILP (CODING_ATTR_POST_READ (attrs))
7731 && NILP (get_translation_table (attrs, 0, NULL)))
7733 chars = coding->head_ascii;
7734 if (chars < 0)
7735 chars = check_ascii (coding);
7736 if (chars != bytes)
7738 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7739 chars = check_utf_8 (coding);
7740 else
7741 chars = -1;
7743 if (chars >= 0)
7745 if (coding->eol_seen != EOL_SEEN_NONE)
7746 adjust_coding_eol_type (coding, coding->eol_seen);
7748 if (coding->eol_seen == EOL_SEEN_CR)
7750 unsigned char *src_end = GAP_END_ADDR;
7751 unsigned char *src = src_end - coding->src_bytes;
7753 while (src < src_end)
7755 if (*src++ == '\r')
7756 src[-1] = '\n';
7759 else if (coding->eol_seen == EOL_SEEN_CRLF)
7761 unsigned char *src = GAP_END_ADDR;
7762 unsigned char *src_beg = src - coding->src_bytes;
7763 unsigned char *dst = src;
7764 ptrdiff_t diff;
7766 while (src_beg < src)
7768 *--dst = *--src;
7769 if (*src == '\n')
7770 src--;
7772 diff = dst - src;
7773 bytes -= diff;
7774 chars -= diff;
7776 coding->produced = bytes;
7777 coding->produced_char = chars;
7778 insert_from_gap (chars, bytes, 1);
7779 return;
7782 code_conversion_save (0, 0);
7784 coding->mode |= CODING_MODE_LAST_BLOCK;
7785 current_buffer->text->inhibit_shrinking = 1;
7786 decode_coding (coding);
7787 current_buffer->text->inhibit_shrinking = 0;
7789 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7791 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7792 Lisp_Object val;
7794 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7795 val = call1 (CODING_ATTR_POST_READ (attrs),
7796 make_number (coding->produced_char));
7797 CHECK_NATNUM (val);
7798 coding->produced_char += Z - prev_Z;
7799 coding->produced += Z_BYTE - prev_Z_BYTE;
7802 unbind_to (count, Qnil);
7806 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7807 SRC_OBJECT into DST_OBJECT by coding context CODING.
7809 SRC_OBJECT is a buffer, a string, or Qnil.
7811 If it is a buffer, the text is at point of the buffer. FROM and TO
7812 are positions in the buffer.
7814 If it is a string, the text is at the beginning of the string.
7815 FROM and TO are indices to the string.
7817 If it is nil, the text is at coding->source. FROM and TO are
7818 indices to coding->source.
7820 DST_OBJECT is a buffer, Qt, or Qnil.
7822 If it is a buffer, the decoded text is inserted at point of the
7823 buffer. If the buffer is the same as SRC_OBJECT, the source text
7824 is deleted.
7826 If it is Qt, a string is made from the decoded text, and
7827 set in CODING->dst_object.
7829 If it is Qnil, the decoded text is stored at CODING->destination.
7830 The caller must allocate CODING->dst_bytes bytes at
7831 CODING->destination by xmalloc. If the decoded text is longer than
7832 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7835 void
7836 decode_coding_object (struct coding_system *coding,
7837 Lisp_Object src_object,
7838 ptrdiff_t from, ptrdiff_t from_byte,
7839 ptrdiff_t to, ptrdiff_t to_byte,
7840 Lisp_Object dst_object)
7842 ptrdiff_t count = SPECPDL_INDEX ();
7843 unsigned char *destination IF_LINT (= NULL);
7844 ptrdiff_t dst_bytes IF_LINT (= 0);
7845 ptrdiff_t chars = to - from;
7846 ptrdiff_t bytes = to_byte - from_byte;
7847 Lisp_Object attrs;
7848 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7849 bool need_marker_adjustment = 0;
7850 Lisp_Object old_deactivate_mark;
7852 old_deactivate_mark = Vdeactivate_mark;
7854 if (NILP (dst_object))
7856 destination = coding->destination;
7857 dst_bytes = coding->dst_bytes;
7860 coding->src_object = src_object;
7861 coding->src_chars = chars;
7862 coding->src_bytes = bytes;
7863 coding->src_multibyte = chars < bytes;
7865 if (STRINGP (src_object))
7867 coding->src_pos = from;
7868 coding->src_pos_byte = from_byte;
7870 else if (BUFFERP (src_object))
7872 set_buffer_internal (XBUFFER (src_object));
7873 if (from != GPT)
7874 move_gap_both (from, from_byte);
7875 if (EQ (src_object, dst_object))
7877 struct Lisp_Marker *tail;
7879 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7881 tail->need_adjustment
7882 = tail->charpos == (tail->insertion_type ? from : to);
7883 need_marker_adjustment |= tail->need_adjustment;
7885 saved_pt = PT, saved_pt_byte = PT_BYTE;
7886 TEMP_SET_PT_BOTH (from, from_byte);
7887 current_buffer->text->inhibit_shrinking = 1;
7888 del_range_both (from, from_byte, to, to_byte, 1);
7889 coding->src_pos = -chars;
7890 coding->src_pos_byte = -bytes;
7892 else
7894 coding->src_pos = from;
7895 coding->src_pos_byte = from_byte;
7899 if (CODING_REQUIRE_DETECTION (coding))
7900 detect_coding (coding);
7901 attrs = CODING_ID_ATTRS (coding->id);
7903 if (EQ (dst_object, Qt)
7904 || (! NILP (CODING_ATTR_POST_READ (attrs))
7905 && NILP (dst_object)))
7907 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7908 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7909 coding->dst_pos = BEG;
7910 coding->dst_pos_byte = BEG_BYTE;
7912 else if (BUFFERP (dst_object))
7914 code_conversion_save (0, 0);
7915 coding->dst_object = dst_object;
7916 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7917 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7918 coding->dst_multibyte
7919 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7921 else
7923 code_conversion_save (0, 0);
7924 coding->dst_object = Qnil;
7925 /* Most callers presume this will return a multibyte result, and they
7926 won't use `binary' or `raw-text' anyway, so let's not worry about
7927 CODING_FOR_UNIBYTE. */
7928 coding->dst_multibyte = 1;
7931 decode_coding (coding);
7933 if (BUFFERP (coding->dst_object))
7934 set_buffer_internal (XBUFFER (coding->dst_object));
7936 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7938 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7939 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7940 Lisp_Object val;
7942 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7943 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7944 old_deactivate_mark);
7945 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7946 make_number (coding->produced_char));
7947 UNGCPRO;
7948 CHECK_NATNUM (val);
7949 coding->produced_char += Z - prev_Z;
7950 coding->produced += Z_BYTE - prev_Z_BYTE;
7953 if (EQ (dst_object, Qt))
7955 coding->dst_object = Fbuffer_string ();
7957 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7959 set_buffer_internal (XBUFFER (coding->dst_object));
7960 if (dst_bytes < coding->produced)
7962 eassert (coding->produced > 0);
7963 destination = xrealloc (destination, coding->produced);
7964 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7965 move_gap_both (BEGV, BEGV_BYTE);
7966 memcpy (destination, BEGV_ADDR, coding->produced);
7967 coding->destination = destination;
7971 if (saved_pt >= 0)
7973 /* This is the case of:
7974 (BUFFERP (src_object) && EQ (src_object, dst_object))
7975 As we have moved PT while replacing the original buffer
7976 contents, we must recover it now. */
7977 set_buffer_internal (XBUFFER (src_object));
7978 current_buffer->text->inhibit_shrinking = 0;
7979 if (saved_pt < from)
7980 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7981 else if (saved_pt < from + chars)
7982 TEMP_SET_PT_BOTH (from, from_byte);
7983 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7984 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7985 saved_pt_byte + (coding->produced - bytes));
7986 else
7987 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7988 saved_pt_byte + (coding->produced - bytes));
7990 if (need_marker_adjustment)
7992 struct Lisp_Marker *tail;
7994 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7995 if (tail->need_adjustment)
7997 tail->need_adjustment = 0;
7998 if (tail->insertion_type)
8000 tail->bytepos = from_byte;
8001 tail->charpos = from;
8003 else
8005 tail->bytepos = from_byte + coding->produced;
8006 tail->charpos
8007 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8008 ? tail->bytepos : from + coding->produced_char);
8014 Vdeactivate_mark = old_deactivate_mark;
8015 unbind_to (count, coding->dst_object);
8019 void
8020 encode_coding_object (struct coding_system *coding,
8021 Lisp_Object src_object,
8022 ptrdiff_t from, ptrdiff_t from_byte,
8023 ptrdiff_t to, ptrdiff_t to_byte,
8024 Lisp_Object dst_object)
8026 ptrdiff_t count = SPECPDL_INDEX ();
8027 ptrdiff_t chars = to - from;
8028 ptrdiff_t bytes = to_byte - from_byte;
8029 Lisp_Object attrs;
8030 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8031 bool need_marker_adjustment = 0;
8032 bool kill_src_buffer = 0;
8033 Lisp_Object old_deactivate_mark;
8035 old_deactivate_mark = Vdeactivate_mark;
8037 coding->src_object = src_object;
8038 coding->src_chars = chars;
8039 coding->src_bytes = bytes;
8040 coding->src_multibyte = chars < bytes;
8042 attrs = CODING_ID_ATTRS (coding->id);
8044 if (EQ (src_object, dst_object))
8046 struct Lisp_Marker *tail;
8048 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8050 tail->need_adjustment
8051 = tail->charpos == (tail->insertion_type ? from : to);
8052 need_marker_adjustment |= tail->need_adjustment;
8056 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8058 coding->src_object = code_conversion_save (1, coding->src_multibyte);
8059 set_buffer_internal (XBUFFER (coding->src_object));
8060 if (STRINGP (src_object))
8061 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8062 else if (BUFFERP (src_object))
8063 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8064 else
8065 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8067 if (EQ (src_object, dst_object))
8069 set_buffer_internal (XBUFFER (src_object));
8070 saved_pt = PT, saved_pt_byte = PT_BYTE;
8071 del_range_both (from, from_byte, to, to_byte, 1);
8072 set_buffer_internal (XBUFFER (coding->src_object));
8076 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8078 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8079 old_deactivate_mark);
8080 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8081 make_number (BEG), make_number (Z));
8082 UNGCPRO;
8084 if (XBUFFER (coding->src_object) != current_buffer)
8085 kill_src_buffer = 1;
8086 coding->src_object = Fcurrent_buffer ();
8087 if (BEG != GPT)
8088 move_gap_both (BEG, BEG_BYTE);
8089 coding->src_chars = Z - BEG;
8090 coding->src_bytes = Z_BYTE - BEG_BYTE;
8091 coding->src_pos = BEG;
8092 coding->src_pos_byte = BEG_BYTE;
8093 coding->src_multibyte = Z < Z_BYTE;
8095 else if (STRINGP (src_object))
8097 code_conversion_save (0, 0);
8098 coding->src_pos = from;
8099 coding->src_pos_byte = from_byte;
8101 else if (BUFFERP (src_object))
8103 code_conversion_save (0, 0);
8104 set_buffer_internal (XBUFFER (src_object));
8105 if (EQ (src_object, dst_object))
8107 saved_pt = PT, saved_pt_byte = PT_BYTE;
8108 coding->src_object = del_range_1 (from, to, 1, 1);
8109 coding->src_pos = 0;
8110 coding->src_pos_byte = 0;
8112 else
8114 if (from < GPT && to >= GPT)
8115 move_gap_both (from, from_byte);
8116 coding->src_pos = from;
8117 coding->src_pos_byte = from_byte;
8120 else
8121 code_conversion_save (0, 0);
8123 if (BUFFERP (dst_object))
8125 coding->dst_object = dst_object;
8126 if (EQ (src_object, dst_object))
8128 coding->dst_pos = from;
8129 coding->dst_pos_byte = from_byte;
8131 else
8133 struct buffer *current = current_buffer;
8135 set_buffer_temp (XBUFFER (dst_object));
8136 coding->dst_pos = PT;
8137 coding->dst_pos_byte = PT_BYTE;
8138 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8139 set_buffer_temp (current);
8141 coding->dst_multibyte
8142 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8144 else if (EQ (dst_object, Qt))
8146 ptrdiff_t dst_bytes = max (1, coding->src_chars);
8147 coding->dst_object = Qnil;
8148 coding->destination = xmalloc (dst_bytes);
8149 coding->dst_bytes = dst_bytes;
8150 coding->dst_multibyte = 0;
8152 else
8154 coding->dst_object = Qnil;
8155 coding->dst_multibyte = 0;
8158 encode_coding (coding);
8160 if (EQ (dst_object, Qt))
8162 if (BUFFERP (coding->dst_object))
8163 coding->dst_object = Fbuffer_string ();
8164 else
8166 coding->dst_object
8167 = make_unibyte_string ((char *) coding->destination,
8168 coding->produced);
8169 xfree (coding->destination);
8173 if (saved_pt >= 0)
8175 /* This is the case of:
8176 (BUFFERP (src_object) && EQ (src_object, dst_object))
8177 As we have moved PT while replacing the original buffer
8178 contents, we must recover it now. */
8179 set_buffer_internal (XBUFFER (src_object));
8180 if (saved_pt < from)
8181 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8182 else if (saved_pt < from + chars)
8183 TEMP_SET_PT_BOTH (from, from_byte);
8184 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8185 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8186 saved_pt_byte + (coding->produced - bytes));
8187 else
8188 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8189 saved_pt_byte + (coding->produced - bytes));
8191 if (need_marker_adjustment)
8193 struct Lisp_Marker *tail;
8195 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8196 if (tail->need_adjustment)
8198 tail->need_adjustment = 0;
8199 if (tail->insertion_type)
8201 tail->bytepos = from_byte;
8202 tail->charpos = from;
8204 else
8206 tail->bytepos = from_byte + coding->produced;
8207 tail->charpos
8208 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8209 ? tail->bytepos : from + coding->produced_char);
8215 if (kill_src_buffer)
8216 Fkill_buffer (coding->src_object);
8218 Vdeactivate_mark = old_deactivate_mark;
8219 unbind_to (count, Qnil);
8223 Lisp_Object
8224 preferred_coding_system (void)
8226 int id = coding_categories[coding_priorities[0]].id;
8228 return CODING_ID_NAME (id);
8231 #if defined (WINDOWSNT) || defined (CYGWIN)
8233 Lisp_Object
8234 from_unicode (Lisp_Object str)
8236 CHECK_STRING (str);
8237 if (!STRING_MULTIBYTE (str) &&
8238 SBYTES (str) & 1)
8240 str = Fsubstring (str, make_number (0), make_number (-1));
8243 return code_convert_string_norecord (str, Qutf_16le, 0);
8246 Lisp_Object
8247 from_unicode_buffer (const wchar_t* wstr)
8249 return from_unicode (
8250 make_unibyte_string (
8251 (char*) wstr,
8252 /* we get one of the two final 0 bytes for free. */
8253 1 + sizeof (wchar_t) * wcslen (wstr)));
8256 wchar_t *
8257 to_unicode (Lisp_Object str, Lisp_Object *buf)
8259 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8260 /* We need to make another copy (in addition to the one made by
8261 code_convert_string_norecord) to ensure that the final string is
8262 _doubly_ zero terminated --- that is, that the string is
8263 terminated by two zero bytes and one utf-16le null character.
8264 Because strings are already terminated with a single zero byte,
8265 we just add one additional zero. */
8266 str = make_uninit_string (SBYTES (*buf) + 1);
8267 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8268 SDATA (str) [SBYTES (*buf)] = '\0';
8269 *buf = str;
8270 return WCSDATA (*buf);
8273 #endif /* WINDOWSNT || CYGWIN */
8276 #ifdef emacs
8277 /*** 8. Emacs Lisp library functions ***/
8279 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8280 doc: /* Return t if OBJECT is nil or a coding-system.
8281 See the documentation of `define-coding-system' for information
8282 about coding-system objects. */)
8283 (Lisp_Object object)
8285 if (NILP (object)
8286 || CODING_SYSTEM_ID (object) >= 0)
8287 return Qt;
8288 if (! SYMBOLP (object)
8289 || NILP (Fget (object, Qcoding_system_define_form)))
8290 return Qnil;
8291 return Qt;
8294 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8295 Sread_non_nil_coding_system, 1, 1, 0,
8296 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8297 (Lisp_Object prompt)
8299 Lisp_Object val;
8302 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8303 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8305 while (SCHARS (val) == 0);
8306 return (Fintern (val, Qnil));
8309 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8310 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8311 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8312 Ignores case when completing coding systems (all Emacs coding systems
8313 are lower-case). */)
8314 (Lisp_Object prompt, Lisp_Object default_coding_system)
8316 Lisp_Object val;
8317 ptrdiff_t count = SPECPDL_INDEX ();
8319 if (SYMBOLP (default_coding_system))
8320 default_coding_system = SYMBOL_NAME (default_coding_system);
8321 specbind (Qcompletion_ignore_case, Qt);
8322 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8323 Qt, Qnil, Qcoding_system_history,
8324 default_coding_system, Qnil);
8325 unbind_to (count, Qnil);
8326 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8329 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8330 1, 1, 0,
8331 doc: /* Check validity of CODING-SYSTEM.
8332 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8333 It is valid if it is nil or a symbol defined as a coding system by the
8334 function `define-coding-system'. */)
8335 (Lisp_Object coding_system)
8337 Lisp_Object define_form;
8339 define_form = Fget (coding_system, Qcoding_system_define_form);
8340 if (! NILP (define_form))
8342 Fput (coding_system, Qcoding_system_define_form, Qnil);
8343 safe_eval (define_form);
8345 if (!NILP (Fcoding_system_p (coding_system)))
8346 return coding_system;
8347 xsignal1 (Qcoding_system_error, coding_system);
8351 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8352 HIGHEST, return the coding system of the highest
8353 priority among the detected coding systems. Otherwise return a
8354 list of detected coding systems sorted by their priorities. If
8355 MULTIBYTEP, it is assumed that the bytes are in correct
8356 multibyte form but contains only ASCII and eight-bit chars.
8357 Otherwise, the bytes are raw bytes.
8359 CODING-SYSTEM controls the detection as below:
8361 If it is nil, detect both text-format and eol-format. If the
8362 text-format part of CODING-SYSTEM is already specified
8363 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8364 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8365 detect only text-format. */
8367 Lisp_Object
8368 detect_coding_system (const unsigned char *src,
8369 ptrdiff_t src_chars, ptrdiff_t src_bytes,
8370 bool highest, bool multibytep,
8371 Lisp_Object coding_system)
8373 const unsigned char *src_end = src + src_bytes;
8374 Lisp_Object attrs, eol_type;
8375 Lisp_Object val = Qnil;
8376 struct coding_system coding;
8377 ptrdiff_t id;
8378 struct coding_detection_info detect_info;
8379 enum coding_category base_category;
8380 bool null_byte_found = 0, eight_bit_found = 0;
8382 if (NILP (coding_system))
8383 coding_system = Qundecided;
8384 setup_coding_system (coding_system, &coding);
8385 attrs = CODING_ID_ATTRS (coding.id);
8386 eol_type = CODING_ID_EOL_TYPE (coding.id);
8387 coding_system = CODING_ATTR_BASE_NAME (attrs);
8389 coding.source = src;
8390 coding.src_chars = src_chars;
8391 coding.src_bytes = src_bytes;
8392 coding.src_multibyte = multibytep;
8393 coding.consumed = 0;
8394 coding.mode |= CODING_MODE_LAST_BLOCK;
8395 coding.head_ascii = 0;
8397 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8399 /* At first, detect text-format if necessary. */
8400 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8401 if (base_category == coding_category_undecided)
8403 enum coding_category category IF_LINT (= 0);
8404 struct coding_system *this IF_LINT (= NULL);
8405 int c, i;
8407 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8408 for (; src < src_end; src++)
8410 c = *src;
8411 if (c & 0x80)
8413 eight_bit_found = 1;
8414 if (null_byte_found)
8415 break;
8417 else if (c < 0x20)
8419 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8420 && ! inhibit_iso_escape_detection
8421 && ! detect_info.checked)
8423 if (detect_coding_iso_2022 (&coding, &detect_info))
8425 /* We have scanned the whole data. */
8426 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8428 /* We didn't find an 8-bit code. We may
8429 have found a null-byte, but it's very
8430 rare that a binary file confirm to
8431 ISO-2022. */
8432 src = src_end;
8433 coding.head_ascii = src - coding.source;
8435 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8436 break;
8439 else if (! c && !inhibit_null_byte_detection)
8441 null_byte_found = 1;
8442 if (eight_bit_found)
8443 break;
8445 if (! eight_bit_found)
8446 coding.head_ascii++;
8448 else if (! eight_bit_found)
8449 coding.head_ascii++;
8452 if (null_byte_found || eight_bit_found
8453 || coding.head_ascii < coding.src_bytes
8454 || detect_info.found)
8456 if (coding.head_ascii == coding.src_bytes)
8457 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8458 for (i = 0; i < coding_category_raw_text; i++)
8460 category = coding_priorities[i];
8461 this = coding_categories + category;
8462 if (detect_info.found & (1 << category))
8463 break;
8465 else
8467 if (null_byte_found)
8469 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8470 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8472 for (i = 0; i < coding_category_raw_text; i++)
8474 category = coding_priorities[i];
8475 this = coding_categories + category;
8477 if (this->id < 0)
8479 /* No coding system of this category is defined. */
8480 detect_info.rejected |= (1 << category);
8482 else if (category >= coding_category_raw_text)
8483 continue;
8484 else if (detect_info.checked & (1 << category))
8486 if (highest
8487 && (detect_info.found & (1 << category)))
8488 break;
8490 else if ((*(this->detector)) (&coding, &detect_info)
8491 && highest
8492 && (detect_info.found & (1 << category)))
8494 if (category == coding_category_utf_16_auto)
8496 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8497 category = coding_category_utf_16_le;
8498 else
8499 category = coding_category_utf_16_be;
8501 break;
8507 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8508 || null_byte_found)
8510 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8511 id = CODING_SYSTEM_ID (Qno_conversion);
8512 val = Fcons (make_number (id), Qnil);
8514 else if (! detect_info.rejected && ! detect_info.found)
8516 detect_info.found = CATEGORY_MASK_ANY;
8517 id = coding_categories[coding_category_undecided].id;
8518 val = Fcons (make_number (id), Qnil);
8520 else if (highest)
8522 if (detect_info.found)
8524 detect_info.found = 1 << category;
8525 val = Fcons (make_number (this->id), Qnil);
8527 else
8528 for (i = 0; i < coding_category_raw_text; i++)
8529 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8531 detect_info.found = 1 << coding_priorities[i];
8532 id = coding_categories[coding_priorities[i]].id;
8533 val = Fcons (make_number (id), Qnil);
8534 break;
8537 else
8539 int mask = detect_info.rejected | detect_info.found;
8540 int found = 0;
8542 for (i = coding_category_raw_text - 1; i >= 0; i--)
8544 category = coding_priorities[i];
8545 if (! (mask & (1 << category)))
8547 found |= 1 << category;
8548 id = coding_categories[category].id;
8549 if (id >= 0)
8550 val = Fcons (make_number (id), val);
8553 for (i = coding_category_raw_text - 1; i >= 0; i--)
8555 category = coding_priorities[i];
8556 if (detect_info.found & (1 << category))
8558 id = coding_categories[category].id;
8559 val = Fcons (make_number (id), val);
8562 detect_info.found |= found;
8565 else if (base_category == coding_category_utf_8_auto)
8567 if (detect_coding_utf_8 (&coding, &detect_info))
8569 struct coding_system *this;
8571 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8572 this = coding_categories + coding_category_utf_8_sig;
8573 else
8574 this = coding_categories + coding_category_utf_8_nosig;
8575 val = Fcons (make_number (this->id), Qnil);
8578 else if (base_category == coding_category_utf_16_auto)
8580 if (detect_coding_utf_16 (&coding, &detect_info))
8582 struct coding_system *this;
8584 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8585 this = coding_categories + coding_category_utf_16_le;
8586 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8587 this = coding_categories + coding_category_utf_16_be;
8588 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8589 this = coding_categories + coding_category_utf_16_be_nosig;
8590 else
8591 this = coding_categories + coding_category_utf_16_le_nosig;
8592 val = Fcons (make_number (this->id), Qnil);
8595 else
8597 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8598 val = Fcons (make_number (coding.id), Qnil);
8601 /* Then, detect eol-format if necessary. */
8603 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8604 Lisp_Object tail;
8606 if (VECTORP (eol_type))
8608 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8610 if (null_byte_found)
8611 normal_eol = EOL_SEEN_LF;
8612 else
8613 normal_eol = detect_eol (coding.source, src_bytes,
8614 coding_category_raw_text);
8616 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8617 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8618 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8619 coding_category_utf_16_be);
8620 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8621 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8622 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8623 coding_category_utf_16_le);
8625 else
8627 if (EQ (eol_type, Qunix))
8628 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8629 else if (EQ (eol_type, Qdos))
8630 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8631 else
8632 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8635 for (tail = val; CONSP (tail); tail = XCDR (tail))
8637 enum coding_category category;
8638 int this_eol;
8640 id = XINT (XCAR (tail));
8641 attrs = CODING_ID_ATTRS (id);
8642 category = XINT (CODING_ATTR_CATEGORY (attrs));
8643 eol_type = CODING_ID_EOL_TYPE (id);
8644 if (VECTORP (eol_type))
8646 if (category == coding_category_utf_16_be
8647 || category == coding_category_utf_16_be_nosig)
8648 this_eol = utf_16_be_eol;
8649 else if (category == coding_category_utf_16_le
8650 || category == coding_category_utf_16_le_nosig)
8651 this_eol = utf_16_le_eol;
8652 else
8653 this_eol = normal_eol;
8655 if (this_eol == EOL_SEEN_LF)
8656 XSETCAR (tail, AREF (eol_type, 0));
8657 else if (this_eol == EOL_SEEN_CRLF)
8658 XSETCAR (tail, AREF (eol_type, 1));
8659 else if (this_eol == EOL_SEEN_CR)
8660 XSETCAR (tail, AREF (eol_type, 2));
8661 else
8662 XSETCAR (tail, CODING_ID_NAME (id));
8664 else
8665 XSETCAR (tail, CODING_ID_NAME (id));
8669 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8673 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8674 2, 3, 0,
8675 doc: /* Detect coding system of the text in the region between START and END.
8676 Return a list of possible coding systems ordered by priority.
8677 The coding systems to try and their priorities follows what
8678 the function `coding-system-priority-list' (which see) returns.
8680 If only ASCII characters are found (except for such ISO-2022 control
8681 characters as ESC), it returns a list of single element `undecided'
8682 or its subsidiary coding system according to a detected end-of-line
8683 format.
8685 If optional argument HIGHEST is non-nil, return the coding system of
8686 highest priority. */)
8687 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8689 ptrdiff_t from, to;
8690 ptrdiff_t from_byte, to_byte;
8692 validate_region (&start, &end);
8693 from = XINT (start), to = XINT (end);
8694 from_byte = CHAR_TO_BYTE (from);
8695 to_byte = CHAR_TO_BYTE (to);
8697 if (from < GPT && to >= GPT)
8698 move_gap_both (to, to_byte);
8700 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8701 to - from, to_byte - from_byte,
8702 !NILP (highest),
8703 !NILP (BVAR (current_buffer
8704 , enable_multibyte_characters)),
8705 Qnil);
8708 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8709 1, 2, 0,
8710 doc: /* Detect coding system of the text in STRING.
8711 Return a list of possible coding systems ordered by priority.
8712 The coding systems to try and their priorities follows what
8713 the function `coding-system-priority-list' (which see) returns.
8715 If only ASCII characters are found (except for such ISO-2022 control
8716 characters as ESC), it returns a list of single element `undecided'
8717 or its subsidiary coding system according to a detected end-of-line
8718 format.
8720 If optional argument HIGHEST is non-nil, return the coding system of
8721 highest priority. */)
8722 (Lisp_Object string, Lisp_Object highest)
8724 CHECK_STRING (string);
8726 return detect_coding_system (SDATA (string),
8727 SCHARS (string), SBYTES (string),
8728 !NILP (highest), STRING_MULTIBYTE (string),
8729 Qnil);
8733 static bool
8734 char_encodable_p (int c, Lisp_Object attrs)
8736 Lisp_Object tail;
8737 struct charset *charset;
8738 Lisp_Object translation_table;
8740 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8741 if (! NILP (translation_table))
8742 c = translate_char (translation_table, c);
8743 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8744 CONSP (tail); tail = XCDR (tail))
8746 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8747 if (CHAR_CHARSET_P (c, charset))
8748 break;
8750 return (! NILP (tail));
8754 /* Return a list of coding systems that safely encode the text between
8755 START and END. If EXCLUDE is non-nil, it is a list of coding
8756 systems not to check. The returned list doesn't contain any such
8757 coding systems. In any case, if the text contains only ASCII or is
8758 unibyte, return t. */
8760 DEFUN ("find-coding-systems-region-internal",
8761 Ffind_coding_systems_region_internal,
8762 Sfind_coding_systems_region_internal, 2, 3, 0,
8763 doc: /* Internal use only. */)
8764 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8766 Lisp_Object coding_attrs_list, safe_codings;
8767 ptrdiff_t start_byte, end_byte;
8768 const unsigned char *p, *pbeg, *pend;
8769 int c;
8770 Lisp_Object tail, elt, work_table;
8772 if (STRINGP (start))
8774 if (!STRING_MULTIBYTE (start)
8775 || SCHARS (start) == SBYTES (start))
8776 return Qt;
8777 start_byte = 0;
8778 end_byte = SBYTES (start);
8780 else
8782 CHECK_NUMBER_COERCE_MARKER (start);
8783 CHECK_NUMBER_COERCE_MARKER (end);
8784 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8785 args_out_of_range (start, end);
8786 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8787 return Qt;
8788 start_byte = CHAR_TO_BYTE (XINT (start));
8789 end_byte = CHAR_TO_BYTE (XINT (end));
8790 if (XINT (end) - XINT (start) == end_byte - start_byte)
8791 return Qt;
8793 if (XINT (start) < GPT && XINT (end) > GPT)
8795 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8796 move_gap_both (XINT (start), start_byte);
8797 else
8798 move_gap_both (XINT (end), end_byte);
8802 coding_attrs_list = Qnil;
8803 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8804 if (NILP (exclude)
8805 || NILP (Fmemq (XCAR (tail), exclude)))
8807 Lisp_Object attrs;
8809 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8810 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8811 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8813 ASET (attrs, coding_attr_trans_tbl,
8814 get_translation_table (attrs, 1, NULL));
8815 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8819 if (STRINGP (start))
8820 p = pbeg = SDATA (start);
8821 else
8822 p = pbeg = BYTE_POS_ADDR (start_byte);
8823 pend = p + (end_byte - start_byte);
8825 while (p < pend && ASCII_BYTE_P (*p)) p++;
8826 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8828 work_table = Fmake_char_table (Qnil, Qnil);
8829 while (p < pend)
8831 if (ASCII_BYTE_P (*p))
8832 p++;
8833 else
8835 c = STRING_CHAR_ADVANCE (p);
8836 if (!NILP (char_table_ref (work_table, c)))
8837 /* This character was already checked. Ignore it. */
8838 continue;
8840 charset_map_loaded = 0;
8841 for (tail = coding_attrs_list; CONSP (tail);)
8843 elt = XCAR (tail);
8844 if (NILP (elt))
8845 tail = XCDR (tail);
8846 else if (char_encodable_p (c, elt))
8847 tail = XCDR (tail);
8848 else if (CONSP (XCDR (tail)))
8850 XSETCAR (tail, XCAR (XCDR (tail)));
8851 XSETCDR (tail, XCDR (XCDR (tail)));
8853 else
8855 XSETCAR (tail, Qnil);
8856 tail = XCDR (tail);
8859 if (charset_map_loaded)
8861 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8863 if (STRINGP (start))
8864 pbeg = SDATA (start);
8865 else
8866 pbeg = BYTE_POS_ADDR (start_byte);
8867 p = pbeg + p_offset;
8868 pend = pbeg + pend_offset;
8870 char_table_set (work_table, c, Qt);
8874 safe_codings = list2 (Qraw_text, Qno_conversion);
8875 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8876 if (! NILP (XCAR (tail)))
8877 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8879 return safe_codings;
8883 DEFUN ("unencodable-char-position", Funencodable_char_position,
8884 Sunencodable_char_position, 3, 5, 0,
8885 doc: /*
8886 Return position of first un-encodable character in a region.
8887 START and END specify the region and CODING-SYSTEM specifies the
8888 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8890 If optional 4th argument COUNT is non-nil, it specifies at most how
8891 many un-encodable characters to search. In this case, the value is a
8892 list of positions.
8894 If optional 5th argument STRING is non-nil, it is a string to search
8895 for un-encodable characters. In that case, START and END are indexes
8896 to the string. */)
8897 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8899 EMACS_INT n;
8900 struct coding_system coding;
8901 Lisp_Object attrs, charset_list, translation_table;
8902 Lisp_Object positions;
8903 ptrdiff_t from, to;
8904 const unsigned char *p, *stop, *pend;
8905 bool ascii_compatible;
8907 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8908 attrs = CODING_ID_ATTRS (coding.id);
8909 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8910 return Qnil;
8911 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8912 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8913 translation_table = get_translation_table (attrs, 1, NULL);
8915 if (NILP (string))
8917 validate_region (&start, &end);
8918 from = XINT (start);
8919 to = XINT (end);
8920 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8921 || (ascii_compatible
8922 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8923 return Qnil;
8924 p = CHAR_POS_ADDR (from);
8925 pend = CHAR_POS_ADDR (to);
8926 if (from < GPT && to >= GPT)
8927 stop = GPT_ADDR;
8928 else
8929 stop = pend;
8931 else
8933 CHECK_STRING (string);
8934 CHECK_NATNUM (start);
8935 CHECK_NATNUM (end);
8936 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8937 args_out_of_range_3 (string, start, end);
8938 from = XINT (start);
8939 to = XINT (end);
8940 if (! STRING_MULTIBYTE (string))
8941 return Qnil;
8942 p = SDATA (string) + string_char_to_byte (string, from);
8943 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8944 if (ascii_compatible && (to - from) == (pend - p))
8945 return Qnil;
8948 if (NILP (count))
8949 n = 1;
8950 else
8952 CHECK_NATNUM (count);
8953 n = XINT (count);
8956 positions = Qnil;
8957 charset_map_loaded = 0;
8958 while (1)
8960 int c;
8962 if (ascii_compatible)
8963 while (p < stop && ASCII_BYTE_P (*p))
8964 p++, from++;
8965 if (p >= stop)
8967 if (p >= pend)
8968 break;
8969 stop = pend;
8970 p = GAP_END_ADDR;
8973 c = STRING_CHAR_ADVANCE (p);
8974 if (! (ASCII_CHAR_P (c) && ascii_compatible)
8975 && ! char_charset (translate_char (translation_table, c),
8976 charset_list, NULL))
8978 positions = Fcons (make_number (from), positions);
8979 n--;
8980 if (n == 0)
8981 break;
8984 from++;
8985 if (charset_map_loaded && NILP (string))
8987 p = CHAR_POS_ADDR (from);
8988 pend = CHAR_POS_ADDR (to);
8989 if (from < GPT && to >= GPT)
8990 stop = GPT_ADDR;
8991 else
8992 stop = pend;
8993 charset_map_loaded = 0;
8997 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9001 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9002 Scheck_coding_systems_region, 3, 3, 0,
9003 doc: /* Check if the region is encodable by coding systems.
9005 START and END are buffer positions specifying the region.
9006 CODING-SYSTEM-LIST is a list of coding systems to check.
9008 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9009 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9010 whole region, POS0, POS1, ... are buffer positions where non-encodable
9011 characters are found.
9013 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9014 value is nil.
9016 START may be a string. In that case, check if the string is
9017 encodable, and the value contains indices to the string instead of
9018 buffer positions. END is ignored.
9020 If the current buffer (or START if it is a string) is unibyte, the value
9021 is nil. */)
9022 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9024 Lisp_Object list;
9025 ptrdiff_t start_byte, end_byte;
9026 ptrdiff_t pos;
9027 const unsigned char *p, *pbeg, *pend;
9028 int c;
9029 Lisp_Object tail, elt, attrs;
9031 if (STRINGP (start))
9033 if (!STRING_MULTIBYTE (start)
9034 || SCHARS (start) == SBYTES (start))
9035 return Qnil;
9036 start_byte = 0;
9037 end_byte = SBYTES (start);
9038 pos = 0;
9040 else
9042 CHECK_NUMBER_COERCE_MARKER (start);
9043 CHECK_NUMBER_COERCE_MARKER (end);
9044 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9045 args_out_of_range (start, end);
9046 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9047 return Qnil;
9048 start_byte = CHAR_TO_BYTE (XINT (start));
9049 end_byte = CHAR_TO_BYTE (XINT (end));
9050 if (XINT (end) - XINT (start) == end_byte - start_byte)
9051 return Qnil;
9053 if (XINT (start) < GPT && XINT (end) > GPT)
9055 if ((GPT - XINT (start)) < (XINT (end) - GPT))
9056 move_gap_both (XINT (start), start_byte);
9057 else
9058 move_gap_both (XINT (end), end_byte);
9060 pos = XINT (start);
9063 list = Qnil;
9064 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9066 elt = XCAR (tail);
9067 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9068 ASET (attrs, coding_attr_trans_tbl,
9069 get_translation_table (attrs, 1, NULL));
9070 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9073 if (STRINGP (start))
9074 p = pbeg = SDATA (start);
9075 else
9076 p = pbeg = BYTE_POS_ADDR (start_byte);
9077 pend = p + (end_byte - start_byte);
9079 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9080 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9082 while (p < pend)
9084 if (ASCII_BYTE_P (*p))
9085 p++;
9086 else
9088 c = STRING_CHAR_ADVANCE (p);
9090 charset_map_loaded = 0;
9091 for (tail = list; CONSP (tail); tail = XCDR (tail))
9093 elt = XCDR (XCAR (tail));
9094 if (! char_encodable_p (c, XCAR (elt)))
9095 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9097 if (charset_map_loaded)
9099 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9101 if (STRINGP (start))
9102 pbeg = SDATA (start);
9103 else
9104 pbeg = BYTE_POS_ADDR (start_byte);
9105 p = pbeg + p_offset;
9106 pend = pbeg + pend_offset;
9109 pos++;
9112 tail = list;
9113 list = Qnil;
9114 for (; CONSP (tail); tail = XCDR (tail))
9116 elt = XCAR (tail);
9117 if (CONSP (XCDR (XCDR (elt))))
9118 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9119 list);
9122 return list;
9126 static Lisp_Object
9127 code_convert_region (Lisp_Object start, Lisp_Object end,
9128 Lisp_Object coding_system, Lisp_Object dst_object,
9129 bool encodep, bool norecord)
9131 struct coding_system coding;
9132 ptrdiff_t from, from_byte, to, to_byte;
9133 Lisp_Object src_object;
9135 if (NILP (coding_system))
9136 coding_system = Qno_conversion;
9137 else
9138 CHECK_CODING_SYSTEM (coding_system);
9139 src_object = Fcurrent_buffer ();
9140 if (NILP (dst_object))
9141 dst_object = src_object;
9142 else if (! EQ (dst_object, Qt))
9143 CHECK_BUFFER (dst_object);
9145 validate_region (&start, &end);
9146 from = XFASTINT (start);
9147 from_byte = CHAR_TO_BYTE (from);
9148 to = XFASTINT (end);
9149 to_byte = CHAR_TO_BYTE (to);
9151 setup_coding_system (coding_system, &coding);
9152 coding.mode |= CODING_MODE_LAST_BLOCK;
9154 if (encodep)
9155 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9156 dst_object);
9157 else
9158 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9159 dst_object);
9160 if (! norecord)
9161 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9163 return (BUFFERP (dst_object)
9164 ? make_number (coding.produced_char)
9165 : coding.dst_object);
9169 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9170 3, 4, "r\nzCoding system: ",
9171 doc: /* Decode the current region from the specified coding system.
9172 When called from a program, takes four arguments:
9173 START, END, CODING-SYSTEM, and DESTINATION.
9174 START and END are buffer positions.
9176 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9177 If nil, the region between START and END is replaced by the decoded text.
9178 If buffer, the decoded text is inserted in that buffer after point (point
9179 does not move).
9180 In those cases, the length of the decoded text is returned.
9181 If DESTINATION is t, the decoded text is returned.
9183 This function sets `last-coding-system-used' to the precise coding system
9184 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9185 not fully specified.) */)
9186 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9188 return code_convert_region (start, end, coding_system, destination, 0, 0);
9191 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9192 3, 4, "r\nzCoding system: ",
9193 doc: /* Encode the current region by specified coding system.
9194 When called from a program, takes four arguments:
9195 START, END, CODING-SYSTEM and DESTINATION.
9196 START and END are buffer positions.
9198 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9199 If nil, the region between START and END is replace by the encoded text.
9200 If buffer, the encoded text is inserted in that buffer after point (point
9201 does not move).
9202 In those cases, the length of the encoded text is returned.
9203 If DESTINATION is t, the encoded text is returned.
9205 This function sets `last-coding-system-used' to the precise coding system
9206 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9207 not fully specified.) */)
9208 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9210 return code_convert_region (start, end, coding_system, destination, 1, 0);
9213 Lisp_Object
9214 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9215 Lisp_Object dst_object, bool encodep, bool nocopy,
9216 bool norecord)
9218 struct coding_system coding;
9219 ptrdiff_t chars, bytes;
9221 CHECK_STRING (string);
9222 if (NILP (coding_system))
9224 if (! norecord)
9225 Vlast_coding_system_used = Qno_conversion;
9226 if (NILP (dst_object))
9227 return (nocopy ? Fcopy_sequence (string) : string);
9230 if (NILP (coding_system))
9231 coding_system = Qno_conversion;
9232 else
9233 CHECK_CODING_SYSTEM (coding_system);
9234 if (NILP (dst_object))
9235 dst_object = Qt;
9236 else if (! EQ (dst_object, Qt))
9237 CHECK_BUFFER (dst_object);
9239 setup_coding_system (coding_system, &coding);
9240 coding.mode |= CODING_MODE_LAST_BLOCK;
9241 chars = SCHARS (string);
9242 bytes = SBYTES (string);
9243 if (encodep)
9244 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9245 else
9246 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9247 if (! norecord)
9248 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9250 return (BUFFERP (dst_object)
9251 ? make_number (coding.produced_char)
9252 : coding.dst_object);
9256 /* Encode or decode STRING according to CODING_SYSTEM.
9257 Do not set Vlast_coding_system_used.
9259 This function is called only from macros DECODE_FILE and
9260 ENCODE_FILE, thus we ignore character composition. */
9262 Lisp_Object
9263 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9264 bool encodep)
9266 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9270 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9271 2, 4, 0,
9272 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9274 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9275 if the decoding operation is trivial.
9277 Optional fourth arg BUFFER non-nil means that the decoded text is
9278 inserted in that buffer after point (point does not move). In this
9279 case, the return value is the length of the decoded text.
9281 This function sets `last-coding-system-used' to the precise coding system
9282 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9283 not fully specified.) */)
9284 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9286 return code_convert_string (string, coding_system, buffer,
9287 0, ! NILP (nocopy), 0);
9290 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9291 2, 4, 0,
9292 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9294 Optional third arg NOCOPY non-nil means it is OK to return STRING
9295 itself if the encoding operation is trivial.
9297 Optional fourth arg BUFFER non-nil means that the encoded text is
9298 inserted in that buffer after point (point does not move). In this
9299 case, the return value is the length of the encoded text.
9301 This function sets `last-coding-system-used' to the precise coding system
9302 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9303 not fully specified.) */)
9304 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9306 return code_convert_string (string, coding_system, buffer,
9307 1, ! NILP (nocopy), 0);
9311 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9312 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9313 Return the corresponding character. */)
9314 (Lisp_Object code)
9316 Lisp_Object spec, attrs, val;
9317 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9318 EMACS_INT ch;
9319 int c;
9321 CHECK_NATNUM (code);
9322 ch = XFASTINT (code);
9323 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9324 attrs = AREF (spec, 0);
9326 if (ASCII_BYTE_P (ch)
9327 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9328 return code;
9330 val = CODING_ATTR_CHARSET_LIST (attrs);
9331 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9332 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9333 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9335 if (ch <= 0x7F)
9337 c = ch;
9338 charset = charset_roman;
9340 else if (ch >= 0xA0 && ch < 0xDF)
9342 c = ch - 0x80;
9343 charset = charset_kana;
9345 else
9347 EMACS_INT c1 = ch >> 8;
9348 int c2 = ch & 0xFF;
9350 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9351 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9352 error ("Invalid code: %"pI"d", ch);
9353 c = ch;
9354 SJIS_TO_JIS (c);
9355 charset = charset_kanji;
9357 c = DECODE_CHAR (charset, c);
9358 if (c < 0)
9359 error ("Invalid code: %"pI"d", ch);
9360 return make_number (c);
9364 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9365 doc: /* Encode a Japanese character CH to shift_jis encoding.
9366 Return the corresponding code in SJIS. */)
9367 (Lisp_Object ch)
9369 Lisp_Object spec, attrs, charset_list;
9370 int c;
9371 struct charset *charset;
9372 unsigned code;
9374 CHECK_CHARACTER (ch);
9375 c = XFASTINT (ch);
9376 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9377 attrs = AREF (spec, 0);
9379 if (ASCII_CHAR_P (c)
9380 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9381 return ch;
9383 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9384 charset = char_charset (c, charset_list, &code);
9385 if (code == CHARSET_INVALID_CODE (charset))
9386 error ("Can't encode by shift_jis encoding: %c", c);
9387 JIS_TO_SJIS (code);
9389 return make_number (code);
9392 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9393 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9394 Return the corresponding character. */)
9395 (Lisp_Object code)
9397 Lisp_Object spec, attrs, val;
9398 struct charset *charset_roman, *charset_big5, *charset;
9399 EMACS_INT ch;
9400 int c;
9402 CHECK_NATNUM (code);
9403 ch = XFASTINT (code);
9404 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9405 attrs = AREF (spec, 0);
9407 if (ASCII_BYTE_P (ch)
9408 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9409 return code;
9411 val = CODING_ATTR_CHARSET_LIST (attrs);
9412 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9413 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9415 if (ch <= 0x7F)
9417 c = ch;
9418 charset = charset_roman;
9420 else
9422 EMACS_INT b1 = ch >> 8;
9423 int b2 = ch & 0x7F;
9424 if (b1 < 0xA1 || b1 > 0xFE
9425 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9426 error ("Invalid code: %"pI"d", ch);
9427 c = ch;
9428 charset = charset_big5;
9430 c = DECODE_CHAR (charset, c);
9431 if (c < 0)
9432 error ("Invalid code: %"pI"d", ch);
9433 return make_number (c);
9436 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9437 doc: /* Encode the Big5 character CH to BIG5 coding system.
9438 Return the corresponding character code in Big5. */)
9439 (Lisp_Object ch)
9441 Lisp_Object spec, attrs, charset_list;
9442 struct charset *charset;
9443 int c;
9444 unsigned code;
9446 CHECK_CHARACTER (ch);
9447 c = XFASTINT (ch);
9448 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9449 attrs = AREF (spec, 0);
9450 if (ASCII_CHAR_P (c)
9451 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9452 return ch;
9454 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9455 charset = char_charset (c, charset_list, &code);
9456 if (code == CHARSET_INVALID_CODE (charset))
9457 error ("Can't encode by Big5 encoding: %c", c);
9459 return make_number (code);
9463 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9464 Sset_terminal_coding_system_internal, 1, 2, 0,
9465 doc: /* Internal use only. */)
9466 (Lisp_Object coding_system, Lisp_Object terminal)
9468 struct terminal *term = get_terminal (terminal, 1);
9469 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9470 CHECK_SYMBOL (coding_system);
9471 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9472 /* We had better not send unsafe characters to terminal. */
9473 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9474 /* Character composition should be disabled. */
9475 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9476 terminal_coding->src_multibyte = 1;
9477 terminal_coding->dst_multibyte = 0;
9478 tset_charset_list
9479 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9480 ? coding_charset_list (terminal_coding)
9481 : Fcons (make_number (charset_ascii), Qnil)));
9482 return Qnil;
9485 DEFUN ("set-safe-terminal-coding-system-internal",
9486 Fset_safe_terminal_coding_system_internal,
9487 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9488 doc: /* Internal use only. */)
9489 (Lisp_Object coding_system)
9491 CHECK_SYMBOL (coding_system);
9492 setup_coding_system (Fcheck_coding_system (coding_system),
9493 &safe_terminal_coding);
9494 /* Character composition should be disabled. */
9495 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9496 safe_terminal_coding.src_multibyte = 1;
9497 safe_terminal_coding.dst_multibyte = 0;
9498 return Qnil;
9501 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9502 Sterminal_coding_system, 0, 1, 0,
9503 doc: /* Return coding system specified for terminal output on the given terminal.
9504 TERMINAL may be a terminal object, a frame, or nil for the selected
9505 frame's terminal device. */)
9506 (Lisp_Object terminal)
9508 struct coding_system *terminal_coding
9509 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9510 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9512 /* For backward compatibility, return nil if it is `undecided'. */
9513 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9516 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9517 Sset_keyboard_coding_system_internal, 1, 2, 0,
9518 doc: /* Internal use only. */)
9519 (Lisp_Object coding_system, Lisp_Object terminal)
9521 struct terminal *t = get_terminal (terminal, 1);
9522 CHECK_SYMBOL (coding_system);
9523 if (NILP (coding_system))
9524 coding_system = Qno_conversion;
9525 else
9526 Fcheck_coding_system (coding_system);
9527 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9528 /* Character composition should be disabled. */
9529 TERMINAL_KEYBOARD_CODING (t)->common_flags
9530 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9531 return Qnil;
9534 DEFUN ("keyboard-coding-system",
9535 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9536 doc: /* Return coding system specified for decoding keyboard input. */)
9537 (Lisp_Object terminal)
9539 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9540 (get_terminal (terminal, 1))->id);
9544 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9545 Sfind_operation_coding_system, 1, MANY, 0,
9546 doc: /* Choose a coding system for an operation based on the target name.
9547 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9548 DECODING-SYSTEM is the coding system to use for decoding
9549 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9550 for encoding (in case OPERATION does encoding).
9552 The first argument OPERATION specifies an I/O primitive:
9553 For file I/O, `insert-file-contents' or `write-region'.
9554 For process I/O, `call-process', `call-process-region', or `start-process'.
9555 For network I/O, `open-network-stream'.
9557 The remaining arguments should be the same arguments that were passed
9558 to the primitive. Depending on which primitive, one of those arguments
9559 is selected as the TARGET. For example, if OPERATION does file I/O,
9560 whichever argument specifies the file name is TARGET.
9562 TARGET has a meaning which depends on OPERATION:
9563 For file I/O, TARGET is a file name (except for the special case below).
9564 For process I/O, TARGET is a process name.
9565 For network I/O, TARGET is a service name or a port number.
9567 This function looks up what is specified for TARGET in
9568 `file-coding-system-alist', `process-coding-system-alist',
9569 or `network-coding-system-alist' depending on OPERATION.
9570 They may specify a coding system, a cons of coding systems,
9571 or a function symbol to call.
9572 In the last case, we call the function with one argument,
9573 which is a list of all the arguments given to this function.
9574 If the function can't decide a coding system, it can return
9575 `undecided' so that the normal code-detection is performed.
9577 If OPERATION is `insert-file-contents', the argument corresponding to
9578 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9579 file name to look up, and BUFFER is a buffer that contains the file's
9580 contents (not yet decoded). If `file-coding-system-alist' specifies a
9581 function to call for FILENAME, that function should examine the
9582 contents of BUFFER instead of reading the file.
9584 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9585 (ptrdiff_t nargs, Lisp_Object *args)
9587 Lisp_Object operation, target_idx, target, val;
9588 register Lisp_Object chain;
9590 if (nargs < 2)
9591 error ("Too few arguments");
9592 operation = args[0];
9593 if (!SYMBOLP (operation)
9594 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9595 error ("Invalid first argument");
9596 if (nargs <= 1 + XFASTINT (target_idx))
9597 error ("Too few arguments for operation `%s'",
9598 SDATA (SYMBOL_NAME (operation)));
9599 target = args[XFASTINT (target_idx) + 1];
9600 if (!(STRINGP (target)
9601 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9602 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9603 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9604 error ("Invalid argument %"pI"d of operation `%s'",
9605 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9606 if (CONSP (target))
9607 target = XCAR (target);
9609 chain = ((EQ (operation, Qinsert_file_contents)
9610 || EQ (operation, Qwrite_region))
9611 ? Vfile_coding_system_alist
9612 : (EQ (operation, Qopen_network_stream)
9613 ? Vnetwork_coding_system_alist
9614 : Vprocess_coding_system_alist));
9615 if (NILP (chain))
9616 return Qnil;
9618 for (; CONSP (chain); chain = XCDR (chain))
9620 Lisp_Object elt;
9622 elt = XCAR (chain);
9623 if (CONSP (elt)
9624 && ((STRINGP (target)
9625 && STRINGP (XCAR (elt))
9626 && fast_string_match (XCAR (elt), target) >= 0)
9627 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9629 val = XCDR (elt);
9630 /* Here, if VAL is both a valid coding system and a valid
9631 function symbol, we return VAL as a coding system. */
9632 if (CONSP (val))
9633 return val;
9634 if (! SYMBOLP (val))
9635 return Qnil;
9636 if (! NILP (Fcoding_system_p (val)))
9637 return Fcons (val, val);
9638 if (! NILP (Ffboundp (val)))
9640 /* We use call1 rather than safe_call1
9641 so as to get bug reports about functions called here
9642 which don't handle the current interface. */
9643 val = call1 (val, Flist (nargs, args));
9644 if (CONSP (val))
9645 return val;
9646 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9647 return Fcons (val, val);
9649 return Qnil;
9652 return Qnil;
9655 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9656 Sset_coding_system_priority, 0, MANY, 0,
9657 doc: /* Assign higher priority to the coding systems given as arguments.
9658 If multiple coding systems belong to the same category,
9659 all but the first one are ignored.
9661 usage: (set-coding-system-priority &rest coding-systems) */)
9662 (ptrdiff_t nargs, Lisp_Object *args)
9664 ptrdiff_t i, j;
9665 bool changed[coding_category_max];
9666 enum coding_category priorities[coding_category_max];
9668 memset (changed, 0, sizeof changed);
9670 for (i = j = 0; i < nargs; i++)
9672 enum coding_category category;
9673 Lisp_Object spec, attrs;
9675 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9676 attrs = AREF (spec, 0);
9677 category = XINT (CODING_ATTR_CATEGORY (attrs));
9678 if (changed[category])
9679 /* Ignore this coding system because a coding system of the
9680 same category already had a higher priority. */
9681 continue;
9682 changed[category] = 1;
9683 priorities[j++] = category;
9684 if (coding_categories[category].id >= 0
9685 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9686 setup_coding_system (args[i], &coding_categories[category]);
9687 Fset (AREF (Vcoding_category_table, category), args[i]);
9690 /* Now we have decided top J priorities. Reflect the order of the
9691 original priorities to the remaining priorities. */
9693 for (i = j, j = 0; i < coding_category_max; i++, j++)
9695 while (j < coding_category_max
9696 && changed[coding_priorities[j]])
9697 j++;
9698 if (j == coding_category_max)
9699 emacs_abort ();
9700 priorities[i] = coding_priorities[j];
9703 memcpy (coding_priorities, priorities, sizeof priorities);
9705 /* Update `coding-category-list'. */
9706 Vcoding_category_list = Qnil;
9707 for (i = coding_category_max; i-- > 0; )
9708 Vcoding_category_list
9709 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9710 Vcoding_category_list);
9712 return Qnil;
9715 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9716 Scoding_system_priority_list, 0, 1, 0,
9717 doc: /* Return a list of coding systems ordered by their priorities.
9718 The list contains a subset of coding systems; i.e. coding systems
9719 assigned to each coding category (see `coding-category-list').
9721 HIGHESTP non-nil means just return the highest priority one. */)
9722 (Lisp_Object highestp)
9724 int i;
9725 Lisp_Object val;
9727 for (i = 0, val = Qnil; i < coding_category_max; i++)
9729 enum coding_category category = coding_priorities[i];
9730 int id = coding_categories[category].id;
9731 Lisp_Object attrs;
9733 if (id < 0)
9734 continue;
9735 attrs = CODING_ID_ATTRS (id);
9736 if (! NILP (highestp))
9737 return CODING_ATTR_BASE_NAME (attrs);
9738 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9740 return Fnreverse (val);
9743 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9745 static Lisp_Object
9746 make_subsidiaries (Lisp_Object base)
9748 Lisp_Object subsidiaries;
9749 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9750 char *buf = alloca (base_name_len + 6);
9751 int i;
9753 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9754 subsidiaries = make_uninit_vector (3);
9755 for (i = 0; i < 3; i++)
9757 strcpy (buf + base_name_len, suffixes[i]);
9758 ASET (subsidiaries, i, intern (buf));
9760 return subsidiaries;
9764 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9765 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9766 doc: /* For internal use only.
9767 usage: (define-coding-system-internal ...) */)
9768 (ptrdiff_t nargs, Lisp_Object *args)
9770 Lisp_Object name;
9771 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9772 Lisp_Object attrs; /* Vector of attributes. */
9773 Lisp_Object eol_type;
9774 Lisp_Object aliases;
9775 Lisp_Object coding_type, charset_list, safe_charsets;
9776 enum coding_category category;
9777 Lisp_Object tail, val;
9778 int max_charset_id = 0;
9779 int i;
9781 if (nargs < coding_arg_max)
9782 goto short_args;
9784 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9786 name = args[coding_arg_name];
9787 CHECK_SYMBOL (name);
9788 ASET (attrs, coding_attr_base_name, name);
9790 val = args[coding_arg_mnemonic];
9791 if (! STRINGP (val))
9792 CHECK_CHARACTER (val);
9793 ASET (attrs, coding_attr_mnemonic, val);
9795 coding_type = args[coding_arg_coding_type];
9796 CHECK_SYMBOL (coding_type);
9797 ASET (attrs, coding_attr_type, coding_type);
9799 charset_list = args[coding_arg_charset_list];
9800 if (SYMBOLP (charset_list))
9802 if (EQ (charset_list, Qiso_2022))
9804 if (! EQ (coding_type, Qiso_2022))
9805 error ("Invalid charset-list");
9806 charset_list = Viso_2022_charset_list;
9808 else if (EQ (charset_list, Qemacs_mule))
9810 if (! EQ (coding_type, Qemacs_mule))
9811 error ("Invalid charset-list");
9812 charset_list = Vemacs_mule_charset_list;
9814 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9816 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9817 error ("Invalid charset-list");
9818 if (max_charset_id < XFASTINT (XCAR (tail)))
9819 max_charset_id = XFASTINT (XCAR (tail));
9822 else
9824 charset_list = Fcopy_sequence (charset_list);
9825 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9827 struct charset *charset;
9829 val = XCAR (tail);
9830 CHECK_CHARSET_GET_CHARSET (val, charset);
9831 if (EQ (coding_type, Qiso_2022)
9832 ? CHARSET_ISO_FINAL (charset) < 0
9833 : EQ (coding_type, Qemacs_mule)
9834 ? CHARSET_EMACS_MULE_ID (charset) < 0
9835 : 0)
9836 error ("Can't handle charset `%s'",
9837 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9839 XSETCAR (tail, make_number (charset->id));
9840 if (max_charset_id < charset->id)
9841 max_charset_id = charset->id;
9844 ASET (attrs, coding_attr_charset_list, charset_list);
9846 safe_charsets = make_uninit_string (max_charset_id + 1);
9847 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9848 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9849 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9850 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9852 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9854 val = args[coding_arg_decode_translation_table];
9855 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9856 CHECK_SYMBOL (val);
9857 ASET (attrs, coding_attr_decode_tbl, val);
9859 val = args[coding_arg_encode_translation_table];
9860 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9861 CHECK_SYMBOL (val);
9862 ASET (attrs, coding_attr_encode_tbl, val);
9864 val = args[coding_arg_post_read_conversion];
9865 CHECK_SYMBOL (val);
9866 ASET (attrs, coding_attr_post_read, val);
9868 val = args[coding_arg_pre_write_conversion];
9869 CHECK_SYMBOL (val);
9870 ASET (attrs, coding_attr_pre_write, val);
9872 val = args[coding_arg_default_char];
9873 if (NILP (val))
9874 ASET (attrs, coding_attr_default_char, make_number (' '));
9875 else
9877 CHECK_CHARACTER (val);
9878 ASET (attrs, coding_attr_default_char, val);
9881 val = args[coding_arg_for_unibyte];
9882 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9884 val = args[coding_arg_plist];
9885 CHECK_LIST (val);
9886 ASET (attrs, coding_attr_plist, val);
9888 if (EQ (coding_type, Qcharset))
9890 /* Generate a lisp vector of 256 elements. Each element is nil,
9891 integer, or a list of charset IDs.
9893 If Nth element is nil, the byte code N is invalid in this
9894 coding system.
9896 If Nth element is a number NUM, N is the first byte of a
9897 charset whose ID is NUM.
9899 If Nth element is a list of charset IDs, N is the first byte
9900 of one of them. The list is sorted by dimensions of the
9901 charsets. A charset of smaller dimension comes first. */
9902 val = Fmake_vector (make_number (256), Qnil);
9904 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9906 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9907 int dim = CHARSET_DIMENSION (charset);
9908 int idx = (dim - 1) * 4;
9910 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9911 ASET (attrs, coding_attr_ascii_compat, Qt);
9913 for (i = charset->code_space[idx];
9914 i <= charset->code_space[idx + 1]; i++)
9916 Lisp_Object tmp, tmp2;
9917 int dim2;
9919 tmp = AREF (val, i);
9920 if (NILP (tmp))
9921 tmp = XCAR (tail);
9922 else if (NUMBERP (tmp))
9924 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9925 if (dim < dim2)
9926 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9927 else
9928 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9930 else
9932 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9934 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9935 if (dim < dim2)
9936 break;
9938 if (NILP (tmp2))
9939 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9940 else
9942 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9943 XSETCAR (tmp2, XCAR (tail));
9946 ASET (val, i, tmp);
9949 ASET (attrs, coding_attr_charset_valids, val);
9950 category = coding_category_charset;
9952 else if (EQ (coding_type, Qccl))
9954 Lisp_Object valids;
9956 if (nargs < coding_arg_ccl_max)
9957 goto short_args;
9959 val = args[coding_arg_ccl_decoder];
9960 CHECK_CCL_PROGRAM (val);
9961 if (VECTORP (val))
9962 val = Fcopy_sequence (val);
9963 ASET (attrs, coding_attr_ccl_decoder, val);
9965 val = args[coding_arg_ccl_encoder];
9966 CHECK_CCL_PROGRAM (val);
9967 if (VECTORP (val))
9968 val = Fcopy_sequence (val);
9969 ASET (attrs, coding_attr_ccl_encoder, val);
9971 val = args[coding_arg_ccl_valids];
9972 valids = Fmake_string (make_number (256), make_number (0));
9973 for (tail = val; CONSP (tail); tail = XCDR (tail))
9975 int from, to;
9977 val = XCAR (tail);
9978 if (INTEGERP (val))
9980 if (! (0 <= XINT (val) && XINT (val) <= 255))
9981 args_out_of_range_3 (val, make_number (0), make_number (255));
9982 from = to = XINT (val);
9984 else
9986 CHECK_CONS (val);
9987 CHECK_NATNUM_CAR (val);
9988 CHECK_NUMBER_CDR (val);
9989 if (XINT (XCAR (val)) > 255)
9990 args_out_of_range_3 (XCAR (val),
9991 make_number (0), make_number (255));
9992 from = XINT (XCAR (val));
9993 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9994 args_out_of_range_3 (XCDR (val),
9995 XCAR (val), make_number (255));
9996 to = XINT (XCDR (val));
9998 for (i = from; i <= to; i++)
9999 SSET (valids, i, 1);
10001 ASET (attrs, coding_attr_ccl_valids, valids);
10003 category = coding_category_ccl;
10005 else if (EQ (coding_type, Qutf_16))
10007 Lisp_Object bom, endian;
10009 ASET (attrs, coding_attr_ascii_compat, Qnil);
10011 if (nargs < coding_arg_utf16_max)
10012 goto short_args;
10014 bom = args[coding_arg_utf16_bom];
10015 if (! NILP (bom) && ! EQ (bom, Qt))
10017 CHECK_CONS (bom);
10018 val = XCAR (bom);
10019 CHECK_CODING_SYSTEM (val);
10020 val = XCDR (bom);
10021 CHECK_CODING_SYSTEM (val);
10023 ASET (attrs, coding_attr_utf_bom, bom);
10025 endian = args[coding_arg_utf16_endian];
10026 CHECK_SYMBOL (endian);
10027 if (NILP (endian))
10028 endian = Qbig;
10029 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10030 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10031 ASET (attrs, coding_attr_utf_16_endian, endian);
10033 category = (CONSP (bom)
10034 ? coding_category_utf_16_auto
10035 : NILP (bom)
10036 ? (EQ (endian, Qbig)
10037 ? coding_category_utf_16_be_nosig
10038 : coding_category_utf_16_le_nosig)
10039 : (EQ (endian, Qbig)
10040 ? coding_category_utf_16_be
10041 : coding_category_utf_16_le));
10043 else if (EQ (coding_type, Qiso_2022))
10045 Lisp_Object initial, reg_usage, request, flags;
10047 if (nargs < coding_arg_iso2022_max)
10048 goto short_args;
10050 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10051 CHECK_VECTOR (initial);
10052 for (i = 0; i < 4; i++)
10054 val = AREF (initial, i);
10055 if (! NILP (val))
10057 struct charset *charset;
10059 CHECK_CHARSET_GET_CHARSET (val, charset);
10060 ASET (initial, i, make_number (CHARSET_ID (charset)));
10061 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10062 ASET (attrs, coding_attr_ascii_compat, Qt);
10064 else
10065 ASET (initial, i, make_number (-1));
10068 reg_usage = args[coding_arg_iso2022_reg_usage];
10069 CHECK_CONS (reg_usage);
10070 CHECK_NUMBER_CAR (reg_usage);
10071 CHECK_NUMBER_CDR (reg_usage);
10073 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10074 for (tail = request; CONSP (tail); tail = XCDR (tail))
10076 int id;
10077 Lisp_Object tmp1;
10079 val = XCAR (tail);
10080 CHECK_CONS (val);
10081 tmp1 = XCAR (val);
10082 CHECK_CHARSET_GET_ID (tmp1, id);
10083 CHECK_NATNUM_CDR (val);
10084 if (XINT (XCDR (val)) >= 4)
10085 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10086 XSETCAR (val, make_number (id));
10089 flags = args[coding_arg_iso2022_flags];
10090 CHECK_NATNUM (flags);
10091 i = XINT (flags) & INT_MAX;
10092 if (EQ (args[coding_arg_charset_list], Qiso_2022))
10093 i |= CODING_ISO_FLAG_FULL_SUPPORT;
10094 flags = make_number (i);
10096 ASET (attrs, coding_attr_iso_initial, initial);
10097 ASET (attrs, coding_attr_iso_usage, reg_usage);
10098 ASET (attrs, coding_attr_iso_request, request);
10099 ASET (attrs, coding_attr_iso_flags, flags);
10100 setup_iso_safe_charsets (attrs);
10102 if (i & CODING_ISO_FLAG_SEVEN_BITS)
10103 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10104 | CODING_ISO_FLAG_SINGLE_SHIFT))
10105 ? coding_category_iso_7_else
10106 : EQ (args[coding_arg_charset_list], Qiso_2022)
10107 ? coding_category_iso_7
10108 : coding_category_iso_7_tight);
10109 else
10111 int id = XINT (AREF (initial, 1));
10113 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10114 || EQ (args[coding_arg_charset_list], Qiso_2022)
10115 || id < 0)
10116 ? coding_category_iso_8_else
10117 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10118 ? coding_category_iso_8_1
10119 : coding_category_iso_8_2);
10121 if (category != coding_category_iso_8_1
10122 && category != coding_category_iso_8_2)
10123 ASET (attrs, coding_attr_ascii_compat, Qnil);
10125 else if (EQ (coding_type, Qemacs_mule))
10127 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10128 ASET (attrs, coding_attr_emacs_mule_full, Qt);
10129 ASET (attrs, coding_attr_ascii_compat, Qt);
10130 category = coding_category_emacs_mule;
10132 else if (EQ (coding_type, Qshift_jis))
10135 struct charset *charset;
10137 if (XINT (Flength (charset_list)) != 3
10138 && XINT (Flength (charset_list)) != 4)
10139 error ("There should be three or four charsets");
10141 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10142 if (CHARSET_DIMENSION (charset) != 1)
10143 error ("Dimension of charset %s is not one",
10144 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10145 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10146 ASET (attrs, coding_attr_ascii_compat, Qt);
10148 charset_list = XCDR (charset_list);
10149 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10150 if (CHARSET_DIMENSION (charset) != 1)
10151 error ("Dimension of charset %s is not one",
10152 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10154 charset_list = XCDR (charset_list);
10155 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10156 if (CHARSET_DIMENSION (charset) != 2)
10157 error ("Dimension of charset %s is not two",
10158 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10160 charset_list = XCDR (charset_list);
10161 if (! NILP (charset_list))
10163 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10164 if (CHARSET_DIMENSION (charset) != 2)
10165 error ("Dimension of charset %s is not two",
10166 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10169 category = coding_category_sjis;
10170 Vsjis_coding_system = name;
10172 else if (EQ (coding_type, Qbig5))
10174 struct charset *charset;
10176 if (XINT (Flength (charset_list)) != 2)
10177 error ("There should be just two charsets");
10179 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10180 if (CHARSET_DIMENSION (charset) != 1)
10181 error ("Dimension of charset %s is not one",
10182 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10183 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10184 ASET (attrs, coding_attr_ascii_compat, Qt);
10186 charset_list = XCDR (charset_list);
10187 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10188 if (CHARSET_DIMENSION (charset) != 2)
10189 error ("Dimension of charset %s is not two",
10190 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10192 category = coding_category_big5;
10193 Vbig5_coding_system = name;
10195 else if (EQ (coding_type, Qraw_text))
10197 category = coding_category_raw_text;
10198 ASET (attrs, coding_attr_ascii_compat, Qt);
10200 else if (EQ (coding_type, Qutf_8))
10202 Lisp_Object bom;
10204 if (nargs < coding_arg_utf8_max)
10205 goto short_args;
10207 bom = args[coding_arg_utf8_bom];
10208 if (! NILP (bom) && ! EQ (bom, Qt))
10210 CHECK_CONS (bom);
10211 val = XCAR (bom);
10212 CHECK_CODING_SYSTEM (val);
10213 val = XCDR (bom);
10214 CHECK_CODING_SYSTEM (val);
10216 ASET (attrs, coding_attr_utf_bom, bom);
10217 if (NILP (bom))
10218 ASET (attrs, coding_attr_ascii_compat, Qt);
10220 category = (CONSP (bom) ? coding_category_utf_8_auto
10221 : NILP (bom) ? coding_category_utf_8_nosig
10222 : coding_category_utf_8_sig);
10224 else if (EQ (coding_type, Qundecided))
10225 category = coding_category_undecided;
10226 else
10227 error ("Invalid coding system type: %s",
10228 SDATA (SYMBOL_NAME (coding_type)));
10230 ASET (attrs, coding_attr_category, make_number (category));
10231 ASET (attrs, coding_attr_plist,
10232 Fcons (QCcategory,
10233 Fcons (AREF (Vcoding_category_table, category),
10234 CODING_ATTR_PLIST (attrs))));
10235 ASET (attrs, coding_attr_plist,
10236 Fcons (QCascii_compatible_p,
10237 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10238 CODING_ATTR_PLIST (attrs))));
10240 eol_type = args[coding_arg_eol_type];
10241 if (! NILP (eol_type)
10242 && ! EQ (eol_type, Qunix)
10243 && ! EQ (eol_type, Qdos)
10244 && ! EQ (eol_type, Qmac))
10245 error ("Invalid eol-type");
10247 aliases = Fcons (name, Qnil);
10249 if (NILP (eol_type))
10251 eol_type = make_subsidiaries (name);
10252 for (i = 0; i < 3; i++)
10254 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10256 this_name = AREF (eol_type, i);
10257 this_aliases = Fcons (this_name, Qnil);
10258 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10259 this_spec = make_uninit_vector (3);
10260 ASET (this_spec, 0, attrs);
10261 ASET (this_spec, 1, this_aliases);
10262 ASET (this_spec, 2, this_eol_type);
10263 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10264 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10265 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10266 if (NILP (val))
10267 Vcoding_system_alist
10268 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10269 Vcoding_system_alist);
10273 spec_vec = make_uninit_vector (3);
10274 ASET (spec_vec, 0, attrs);
10275 ASET (spec_vec, 1, aliases);
10276 ASET (spec_vec, 2, eol_type);
10278 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10279 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10280 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10281 if (NILP (val))
10282 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10283 Vcoding_system_alist);
10286 int id = coding_categories[category].id;
10288 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10289 setup_coding_system (name, &coding_categories[category]);
10292 return Qnil;
10294 short_args:
10295 return Fsignal (Qwrong_number_of_arguments,
10296 Fcons (intern ("define-coding-system-internal"),
10297 make_number (nargs)));
10301 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10302 3, 3, 0,
10303 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10304 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10306 Lisp_Object spec, attrs;
10308 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10309 attrs = AREF (spec, 0);
10310 if (EQ (prop, QCmnemonic))
10312 if (! STRINGP (val))
10313 CHECK_CHARACTER (val);
10314 ASET (attrs, coding_attr_mnemonic, val);
10316 else if (EQ (prop, QCdefault_char))
10318 if (NILP (val))
10319 val = make_number (' ');
10320 else
10321 CHECK_CHARACTER (val);
10322 ASET (attrs, coding_attr_default_char, val);
10324 else if (EQ (prop, QCdecode_translation_table))
10326 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10327 CHECK_SYMBOL (val);
10328 ASET (attrs, coding_attr_decode_tbl, val);
10330 else if (EQ (prop, QCencode_translation_table))
10332 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10333 CHECK_SYMBOL (val);
10334 ASET (attrs, coding_attr_encode_tbl, val);
10336 else if (EQ (prop, QCpost_read_conversion))
10338 CHECK_SYMBOL (val);
10339 ASET (attrs, coding_attr_post_read, val);
10341 else if (EQ (prop, QCpre_write_conversion))
10343 CHECK_SYMBOL (val);
10344 ASET (attrs, coding_attr_pre_write, val);
10346 else if (EQ (prop, QCascii_compatible_p))
10348 ASET (attrs, coding_attr_ascii_compat, val);
10351 ASET (attrs, coding_attr_plist,
10352 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10353 return val;
10357 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10358 Sdefine_coding_system_alias, 2, 2, 0,
10359 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10360 (Lisp_Object alias, Lisp_Object coding_system)
10362 Lisp_Object spec, aliases, eol_type, val;
10364 CHECK_SYMBOL (alias);
10365 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10366 aliases = AREF (spec, 1);
10367 /* ALIASES should be a list of length more than zero, and the first
10368 element is a base coding system. Append ALIAS at the tail of the
10369 list. */
10370 while (!NILP (XCDR (aliases)))
10371 aliases = XCDR (aliases);
10372 XSETCDR (aliases, Fcons (alias, Qnil));
10374 eol_type = AREF (spec, 2);
10375 if (VECTORP (eol_type))
10377 Lisp_Object subsidiaries;
10378 int i;
10380 subsidiaries = make_subsidiaries (alias);
10381 for (i = 0; i < 3; i++)
10382 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10383 AREF (eol_type, i));
10386 Fputhash (alias, spec, Vcoding_system_hash_table);
10387 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10388 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10389 if (NILP (val))
10390 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10391 Vcoding_system_alist);
10393 return Qnil;
10396 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10397 1, 1, 0,
10398 doc: /* Return the base of CODING-SYSTEM.
10399 Any alias or subsidiary coding system is not a base coding system. */)
10400 (Lisp_Object coding_system)
10402 Lisp_Object spec, attrs;
10404 if (NILP (coding_system))
10405 return (Qno_conversion);
10406 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10407 attrs = AREF (spec, 0);
10408 return CODING_ATTR_BASE_NAME (attrs);
10411 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10412 1, 1, 0,
10413 doc: "Return the property list of CODING-SYSTEM.")
10414 (Lisp_Object coding_system)
10416 Lisp_Object spec, attrs;
10418 if (NILP (coding_system))
10419 coding_system = Qno_conversion;
10420 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10421 attrs = AREF (spec, 0);
10422 return CODING_ATTR_PLIST (attrs);
10426 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10427 1, 1, 0,
10428 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10429 (Lisp_Object coding_system)
10431 Lisp_Object spec;
10433 if (NILP (coding_system))
10434 coding_system = Qno_conversion;
10435 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10436 return AREF (spec, 1);
10439 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10440 Scoding_system_eol_type, 1, 1, 0,
10441 doc: /* Return eol-type of CODING-SYSTEM.
10442 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10444 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10445 and CR respectively.
10447 A vector value indicates that a format of end-of-line should be
10448 detected automatically. Nth element of the vector is the subsidiary
10449 coding system whose eol-type is N. */)
10450 (Lisp_Object coding_system)
10452 Lisp_Object spec, eol_type;
10453 int n;
10455 if (NILP (coding_system))
10456 coding_system = Qno_conversion;
10457 if (! CODING_SYSTEM_P (coding_system))
10458 return Qnil;
10459 spec = CODING_SYSTEM_SPEC (coding_system);
10460 eol_type = AREF (spec, 2);
10461 if (VECTORP (eol_type))
10462 return Fcopy_sequence (eol_type);
10463 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10464 return make_number (n);
10467 #endif /* emacs */
10470 /*** 9. Post-amble ***/
10472 void
10473 init_coding_once (void)
10475 int i;
10477 for (i = 0; i < coding_category_max; i++)
10479 coding_categories[i].id = -1;
10480 coding_priorities[i] = i;
10483 /* ISO2022 specific initialize routine. */
10484 for (i = 0; i < 0x20; i++)
10485 iso_code_class[i] = ISO_control_0;
10486 for (i = 0x21; i < 0x7F; i++)
10487 iso_code_class[i] = ISO_graphic_plane_0;
10488 for (i = 0x80; i < 0xA0; i++)
10489 iso_code_class[i] = ISO_control_1;
10490 for (i = 0xA1; i < 0xFF; i++)
10491 iso_code_class[i] = ISO_graphic_plane_1;
10492 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10493 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10494 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10495 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10496 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10497 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10498 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10499 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10500 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10502 for (i = 0; i < 256; i++)
10504 emacs_mule_bytes[i] = 1;
10506 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10507 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10508 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10509 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10512 #ifdef emacs
10514 void
10515 syms_of_coding (void)
10517 staticpro (&Vcoding_system_hash_table);
10519 Lisp_Object args[2];
10520 args[0] = QCtest;
10521 args[1] = Qeq;
10522 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10525 staticpro (&Vsjis_coding_system);
10526 Vsjis_coding_system = Qnil;
10528 staticpro (&Vbig5_coding_system);
10529 Vbig5_coding_system = Qnil;
10531 staticpro (&Vcode_conversion_reused_workbuf);
10532 Vcode_conversion_reused_workbuf = Qnil;
10534 staticpro (&Vcode_conversion_workbuf_name);
10535 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10537 reused_workbuf_in_use = 0;
10539 DEFSYM (Qcharset, "charset");
10540 DEFSYM (Qtarget_idx, "target-idx");
10541 DEFSYM (Qcoding_system_history, "coding-system-history");
10542 Fset (Qcoding_system_history, Qnil);
10544 /* Target FILENAME is the first argument. */
10545 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10546 /* Target FILENAME is the third argument. */
10547 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10549 DEFSYM (Qcall_process, "call-process");
10550 /* Target PROGRAM is the first argument. */
10551 Fput (Qcall_process, Qtarget_idx, make_number (0));
10553 DEFSYM (Qcall_process_region, "call-process-region");
10554 /* Target PROGRAM is the third argument. */
10555 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10557 DEFSYM (Qstart_process, "start-process");
10558 /* Target PROGRAM is the third argument. */
10559 Fput (Qstart_process, Qtarget_idx, make_number (2));
10561 DEFSYM (Qopen_network_stream, "open-network-stream");
10562 /* Target SERVICE is the fourth argument. */
10563 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10565 DEFSYM (Qcoding_system, "coding-system");
10566 DEFSYM (Qcoding_aliases, "coding-aliases");
10568 DEFSYM (Qeol_type, "eol-type");
10569 DEFSYM (Qunix, "unix");
10570 DEFSYM (Qdos, "dos");
10571 DEFSYM (Qmac, "mac");
10573 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10574 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10575 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10576 DEFSYM (Qdefault_char, "default-char");
10577 DEFSYM (Qundecided, "undecided");
10578 DEFSYM (Qno_conversion, "no-conversion");
10579 DEFSYM (Qraw_text, "raw-text");
10581 DEFSYM (Qiso_2022, "iso-2022");
10583 DEFSYM (Qutf_8, "utf-8");
10584 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10586 #if defined (WINDOWSNT) || defined (CYGWIN)
10587 /* No, not utf-16-le: that one has a BOM. */
10588 DEFSYM (Qutf_16le, "utf-16le");
10589 #endif
10591 DEFSYM (Qutf_16, "utf-16");
10592 DEFSYM (Qbig, "big");
10593 DEFSYM (Qlittle, "little");
10595 DEFSYM (Qshift_jis, "shift-jis");
10596 DEFSYM (Qbig5, "big5");
10598 DEFSYM (Qcoding_system_p, "coding-system-p");
10600 DEFSYM (Qcoding_system_error, "coding-system-error");
10601 Fput (Qcoding_system_error, Qerror_conditions,
10602 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10603 Fput (Qcoding_system_error, Qerror_message,
10604 build_pure_c_string ("Invalid coding system"));
10606 /* Intern this now in case it isn't already done.
10607 Setting this variable twice is harmless.
10608 But don't staticpro it here--that is done in alloc.c. */
10609 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10611 DEFSYM (Qtranslation_table, "translation-table");
10612 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10613 DEFSYM (Qtranslation_table_id, "translation-table-id");
10614 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10615 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10617 DEFSYM (Qvalid_codes, "valid-codes");
10619 DEFSYM (Qemacs_mule, "emacs-mule");
10621 DEFSYM (QCcategory, ":category");
10622 DEFSYM (QCmnemonic, ":mnemonic");
10623 DEFSYM (QCdefault_char, ":default-char");
10624 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10625 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10626 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10627 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10628 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10630 Vcoding_category_table
10631 = Fmake_vector (make_number (coding_category_max), Qnil);
10632 staticpro (&Vcoding_category_table);
10633 /* Followings are target of code detection. */
10634 ASET (Vcoding_category_table, coding_category_iso_7,
10635 intern_c_string ("coding-category-iso-7"));
10636 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10637 intern_c_string ("coding-category-iso-7-tight"));
10638 ASET (Vcoding_category_table, coding_category_iso_8_1,
10639 intern_c_string ("coding-category-iso-8-1"));
10640 ASET (Vcoding_category_table, coding_category_iso_8_2,
10641 intern_c_string ("coding-category-iso-8-2"));
10642 ASET (Vcoding_category_table, coding_category_iso_7_else,
10643 intern_c_string ("coding-category-iso-7-else"));
10644 ASET (Vcoding_category_table, coding_category_iso_8_else,
10645 intern_c_string ("coding-category-iso-8-else"));
10646 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10647 intern_c_string ("coding-category-utf-8-auto"));
10648 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10649 intern_c_string ("coding-category-utf-8"));
10650 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10651 intern_c_string ("coding-category-utf-8-sig"));
10652 ASET (Vcoding_category_table, coding_category_utf_16_be,
10653 intern_c_string ("coding-category-utf-16-be"));
10654 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10655 intern_c_string ("coding-category-utf-16-auto"));
10656 ASET (Vcoding_category_table, coding_category_utf_16_le,
10657 intern_c_string ("coding-category-utf-16-le"));
10658 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10659 intern_c_string ("coding-category-utf-16-be-nosig"));
10660 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10661 intern_c_string ("coding-category-utf-16-le-nosig"));
10662 ASET (Vcoding_category_table, coding_category_charset,
10663 intern_c_string ("coding-category-charset"));
10664 ASET (Vcoding_category_table, coding_category_sjis,
10665 intern_c_string ("coding-category-sjis"));
10666 ASET (Vcoding_category_table, coding_category_big5,
10667 intern_c_string ("coding-category-big5"));
10668 ASET (Vcoding_category_table, coding_category_ccl,
10669 intern_c_string ("coding-category-ccl"));
10670 ASET (Vcoding_category_table, coding_category_emacs_mule,
10671 intern_c_string ("coding-category-emacs-mule"));
10672 /* Followings are NOT target of code detection. */
10673 ASET (Vcoding_category_table, coding_category_raw_text,
10674 intern_c_string ("coding-category-raw-text"));
10675 ASET (Vcoding_category_table, coding_category_undecided,
10676 intern_c_string ("coding-category-undecided"));
10678 DEFSYM (Qinsufficient_source, "insufficient-source");
10679 DEFSYM (Qinvalid_source, "invalid-source");
10680 DEFSYM (Qinterrupted, "interrupted");
10681 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10683 defsubr (&Scoding_system_p);
10684 defsubr (&Sread_coding_system);
10685 defsubr (&Sread_non_nil_coding_system);
10686 defsubr (&Scheck_coding_system);
10687 defsubr (&Sdetect_coding_region);
10688 defsubr (&Sdetect_coding_string);
10689 defsubr (&Sfind_coding_systems_region_internal);
10690 defsubr (&Sunencodable_char_position);
10691 defsubr (&Scheck_coding_systems_region);
10692 defsubr (&Sdecode_coding_region);
10693 defsubr (&Sencode_coding_region);
10694 defsubr (&Sdecode_coding_string);
10695 defsubr (&Sencode_coding_string);
10696 defsubr (&Sdecode_sjis_char);
10697 defsubr (&Sencode_sjis_char);
10698 defsubr (&Sdecode_big5_char);
10699 defsubr (&Sencode_big5_char);
10700 defsubr (&Sset_terminal_coding_system_internal);
10701 defsubr (&Sset_safe_terminal_coding_system_internal);
10702 defsubr (&Sterminal_coding_system);
10703 defsubr (&Sset_keyboard_coding_system_internal);
10704 defsubr (&Skeyboard_coding_system);
10705 defsubr (&Sfind_operation_coding_system);
10706 defsubr (&Sset_coding_system_priority);
10707 defsubr (&Sdefine_coding_system_internal);
10708 defsubr (&Sdefine_coding_system_alias);
10709 defsubr (&Scoding_system_put);
10710 defsubr (&Scoding_system_base);
10711 defsubr (&Scoding_system_plist);
10712 defsubr (&Scoding_system_aliases);
10713 defsubr (&Scoding_system_eol_type);
10714 defsubr (&Scoding_system_priority_list);
10716 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10717 doc: /* List of coding systems.
10719 Do not alter the value of this variable manually. This variable should be
10720 updated by the functions `define-coding-system' and
10721 `define-coding-system-alias'. */);
10722 Vcoding_system_list = Qnil;
10724 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10725 doc: /* Alist of coding system names.
10726 Each element is one element list of coding system name.
10727 This variable is given to `completing-read' as COLLECTION argument.
10729 Do not alter the value of this variable manually. This variable should be
10730 updated by the functions `make-coding-system' and
10731 `define-coding-system-alias'. */);
10732 Vcoding_system_alist = Qnil;
10734 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10735 doc: /* List of coding-categories (symbols) ordered by priority.
10737 On detecting a coding system, Emacs tries code detection algorithms
10738 associated with each coding-category one by one in this order. When
10739 one algorithm agrees with a byte sequence of source text, the coding
10740 system bound to the corresponding coding-category is selected.
10742 Don't modify this variable directly, but use `set-coding-system-priority'. */);
10744 int i;
10746 Vcoding_category_list = Qnil;
10747 for (i = coding_category_max - 1; i >= 0; i--)
10748 Vcoding_category_list
10749 = Fcons (AREF (Vcoding_category_table, i),
10750 Vcoding_category_list);
10753 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10754 doc: /* Specify the coding system for read operations.
10755 It is useful to bind this variable with `let', but do not set it globally.
10756 If the value is a coding system, it is used for decoding on read operation.
10757 If not, an appropriate element is used from one of the coding system alists.
10758 There are three such tables: `file-coding-system-alist',
10759 `process-coding-system-alist', and `network-coding-system-alist'. */);
10760 Vcoding_system_for_read = Qnil;
10762 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10763 doc: /* Specify the coding system for write operations.
10764 Programs bind this variable with `let', but you should not set it globally.
10765 If the value is a coding system, it is used for encoding of output,
10766 when writing it to a file and when sending it to a file or subprocess.
10768 If this does not specify a coding system, an appropriate element
10769 is used from one of the coding system alists.
10770 There are three such tables: `file-coding-system-alist',
10771 `process-coding-system-alist', and `network-coding-system-alist'.
10772 For output to files, if the above procedure does not specify a coding system,
10773 the value of `buffer-file-coding-system' is used. */);
10774 Vcoding_system_for_write = Qnil;
10776 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10777 doc: /*
10778 Coding system used in the latest file or process I/O. */);
10779 Vlast_coding_system_used = Qnil;
10781 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10782 doc: /*
10783 Error status of the last code conversion.
10785 When an error was detected in the last code conversion, this variable
10786 is set to one of the following symbols.
10787 `insufficient-source'
10788 `inconsistent-eol'
10789 `invalid-source'
10790 `interrupted'
10791 `insufficient-memory'
10792 When no error was detected, the value doesn't change. So, to check
10793 the error status of a code conversion by this variable, you must
10794 explicitly set this variable to nil before performing code
10795 conversion. */);
10796 Vlast_code_conversion_error = Qnil;
10798 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10799 doc: /*
10800 *Non-nil means always inhibit code conversion of end-of-line format.
10801 See info node `Coding Systems' and info node `Text and Binary' concerning
10802 such conversion. */);
10803 inhibit_eol_conversion = 0;
10805 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10806 doc: /*
10807 Non-nil means process buffer inherits coding system of process output.
10808 Bind it to t if the process output is to be treated as if it were a file
10809 read from some filesystem. */);
10810 inherit_process_coding_system = 0;
10812 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10813 doc: /*
10814 Alist to decide a coding system to use for a file I/O operation.
10815 The format is ((PATTERN . VAL) ...),
10816 where PATTERN is a regular expression matching a file name,
10817 VAL is a coding system, a cons of coding systems, or a function symbol.
10818 If VAL is a coding system, it is used for both decoding and encoding
10819 the file contents.
10820 If VAL is a cons of coding systems, the car part is used for decoding,
10821 and the cdr part is used for encoding.
10822 If VAL is a function symbol, the function must return a coding system
10823 or a cons of coding systems which are used as above. The function is
10824 called with an argument that is a list of the arguments with which
10825 `find-operation-coding-system' was called. If the function can't decide
10826 a coding system, it can return `undecided' so that the normal
10827 code-detection is performed.
10829 See also the function `find-operation-coding-system'
10830 and the variable `auto-coding-alist'. */);
10831 Vfile_coding_system_alist = Qnil;
10833 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10834 doc: /*
10835 Alist to decide a coding system to use for a process I/O operation.
10836 The format is ((PATTERN . VAL) ...),
10837 where PATTERN is a regular expression matching a program name,
10838 VAL is a coding system, a cons of coding systems, or a function symbol.
10839 If VAL is a coding system, it is used for both decoding what received
10840 from the program and encoding what sent to the program.
10841 If VAL is a cons of coding systems, the car part is used for decoding,
10842 and the cdr part is used for encoding.
10843 If VAL is a function symbol, the function must return a coding system
10844 or a cons of coding systems which are used as above.
10846 See also the function `find-operation-coding-system'. */);
10847 Vprocess_coding_system_alist = Qnil;
10849 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10850 doc: /*
10851 Alist to decide a coding system to use for a network I/O operation.
10852 The format is ((PATTERN . VAL) ...),
10853 where PATTERN is a regular expression matching a network service name
10854 or is a port number to connect to,
10855 VAL is a coding system, a cons of coding systems, or a function symbol.
10856 If VAL is a coding system, it is used for both decoding what received
10857 from the network stream and encoding what sent to the network stream.
10858 If VAL is a cons of coding systems, the car part is used for decoding,
10859 and the cdr part is used for encoding.
10860 If VAL is a function symbol, the function must return a coding system
10861 or a cons of coding systems which are used as above.
10863 See also the function `find-operation-coding-system'. */);
10864 Vnetwork_coding_system_alist = Qnil;
10866 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10867 doc: /* Coding system to use with system messages.
10868 Also used for decoding keyboard input on X Window system. */);
10869 Vlocale_coding_system = Qnil;
10871 /* The eol mnemonics are reset in startup.el system-dependently. */
10872 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10873 doc: /*
10874 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10875 eol_mnemonic_unix = build_pure_c_string (":");
10877 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10878 doc: /*
10879 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10880 eol_mnemonic_dos = build_pure_c_string ("\\");
10882 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10883 doc: /*
10884 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10885 eol_mnemonic_mac = build_pure_c_string ("/");
10887 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10888 doc: /*
10889 *String displayed in mode line when end-of-line format is not yet determined. */);
10890 eol_mnemonic_undecided = build_pure_c_string (":");
10892 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10893 doc: /*
10894 *Non-nil enables character translation while encoding and decoding. */);
10895 Venable_character_translation = Qt;
10897 DEFVAR_LISP ("standard-translation-table-for-decode",
10898 Vstandard_translation_table_for_decode,
10899 doc: /* Table for translating characters while decoding. */);
10900 Vstandard_translation_table_for_decode = Qnil;
10902 DEFVAR_LISP ("standard-translation-table-for-encode",
10903 Vstandard_translation_table_for_encode,
10904 doc: /* Table for translating characters while encoding. */);
10905 Vstandard_translation_table_for_encode = Qnil;
10907 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10908 doc: /* Alist of charsets vs revision numbers.
10909 While encoding, if a charset (car part of an element) is found,
10910 designate it with the escape sequence identifying revision (cdr part
10911 of the element). */);
10912 Vcharset_revision_table = Qnil;
10914 DEFVAR_LISP ("default-process-coding-system",
10915 Vdefault_process_coding_system,
10916 doc: /* Cons of coding systems used for process I/O by default.
10917 The car part is used for decoding a process output,
10918 the cdr part is used for encoding a text to be sent to a process. */);
10919 Vdefault_process_coding_system = Qnil;
10921 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10922 doc: /*
10923 Table of extra Latin codes in the range 128..159 (inclusive).
10924 This is a vector of length 256.
10925 If Nth element is non-nil, the existence of code N in a file
10926 \(or output of subprocess) doesn't prevent it to be detected as
10927 a coding system of ISO 2022 variant which has a flag
10928 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10929 or reading output of a subprocess.
10930 Only 128th through 159th elements have a meaning. */);
10931 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10933 DEFVAR_LISP ("select-safe-coding-system-function",
10934 Vselect_safe_coding_system_function,
10935 doc: /*
10936 Function to call to select safe coding system for encoding a text.
10938 If set, this function is called to force a user to select a proper
10939 coding system which can encode the text in the case that a default
10940 coding system used in each operation can't encode the text. The
10941 function should take care that the buffer is not modified while
10942 the coding system is being selected.
10944 The default value is `select-safe-coding-system' (which see). */);
10945 Vselect_safe_coding_system_function = Qnil;
10947 DEFVAR_BOOL ("coding-system-require-warning",
10948 coding_system_require_warning,
10949 doc: /* Internal use only.
10950 If non-nil, on writing a file, `select-safe-coding-system-function' is
10951 called even if `coding-system-for-write' is non-nil. The command
10952 `universal-coding-system-argument' binds this variable to t temporarily. */);
10953 coding_system_require_warning = 0;
10956 DEFVAR_BOOL ("inhibit-iso-escape-detection",
10957 inhibit_iso_escape_detection,
10958 doc: /*
10959 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10961 When Emacs reads text, it tries to detect how the text is encoded.
10962 This code detection is sensitive to escape sequences. If Emacs sees
10963 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10964 of the ISO2022 encodings, and decodes text by the corresponding coding
10965 system (e.g. `iso-2022-7bit').
10967 However, there may be a case that you want to read escape sequences in
10968 a file as is. In such a case, you can set this variable to non-nil.
10969 Then the code detection will ignore any escape sequences, and no text is
10970 detected as encoded in some ISO-2022 encoding. The result is that all
10971 escape sequences become visible in a buffer.
10973 The default value is nil, and it is strongly recommended not to change
10974 it. That is because many Emacs Lisp source files that contain
10975 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10976 in Emacs's distribution, and they won't be decoded correctly on
10977 reading if you suppress escape sequence detection.
10979 The other way to read escape sequences in a file without decoding is
10980 to explicitly specify some coding system that doesn't use ISO-2022
10981 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument]. */);
10982 inhibit_iso_escape_detection = 0;
10984 DEFVAR_BOOL ("inhibit-null-byte-detection",
10985 inhibit_null_byte_detection,
10986 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10987 By default, Emacs treats it as binary data, and does not attempt to
10988 decode it. The effect is as if you specified `no-conversion' for
10989 reading that text.
10991 Set this to non-nil when a regular text happens to include null bytes.
10992 Examples are Index nodes of Info files and null-byte delimited output
10993 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10994 decode text as usual. */);
10995 inhibit_null_byte_detection = 0;
10997 DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
10998 doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
10999 Internal use only. Removed after the experimental optimizer gets stable. */);
11000 disable_ascii_optimization = 0;
11002 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11003 doc: /* Char table for translating self-inserting characters.
11004 This is applied to the result of input methods, not their input.
11005 See also `keyboard-translate-table'.
11007 Use of this variable for character code unification was rendered
11008 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11009 internal character representation. */);
11010 Vtranslation_table_for_input = Qnil;
11013 Lisp_Object args[coding_arg_max];
11014 Lisp_Object plist[16];
11015 int i;
11017 for (i = 0; i < coding_arg_max; i++)
11018 args[i] = Qnil;
11020 plist[0] = intern_c_string (":name");
11021 plist[1] = args[coding_arg_name] = Qno_conversion;
11022 plist[2] = intern_c_string (":mnemonic");
11023 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11024 plist[4] = intern_c_string (":coding-type");
11025 plist[5] = args[coding_arg_coding_type] = Qraw_text;
11026 plist[6] = intern_c_string (":ascii-compatible-p");
11027 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11028 plist[8] = intern_c_string (":default-char");
11029 plist[9] = args[coding_arg_default_char] = make_number (0);
11030 plist[10] = intern_c_string (":for-unibyte");
11031 plist[11] = args[coding_arg_for_unibyte] = Qt;
11032 plist[12] = intern_c_string (":docstring");
11033 plist[13] = build_pure_c_string ("Do no conversion.\n\
11035 When you visit a file with this coding, the file is read into a\n\
11036 unibyte buffer as is, thus each byte of a file is treated as a\n\
11037 character.");
11038 plist[14] = intern_c_string (":eol-type");
11039 plist[15] = args[coding_arg_eol_type] = Qunix;
11040 args[coding_arg_plist] = Flist (16, plist);
11041 Fdefine_coding_system_internal (coding_arg_max, args);
11043 plist[1] = args[coding_arg_name] = Qundecided;
11044 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11045 plist[5] = args[coding_arg_coding_type] = Qundecided;
11046 /* This is already set.
11047 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11048 plist[8] = intern_c_string (":charset-list");
11049 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11050 plist[11] = args[coding_arg_for_unibyte] = Qnil;
11051 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11052 plist[15] = args[coding_arg_eol_type] = Qnil;
11053 args[coding_arg_plist] = Flist (16, plist);
11054 Fdefine_coding_system_internal (coding_arg_max, args);
11057 setup_coding_system (Qno_conversion, &safe_terminal_coding);
11060 int i;
11062 for (i = 0; i < coding_category_max; i++)
11063 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11065 #if defined (DOS_NT)
11066 system_eol_type = Qdos;
11067 #else
11068 system_eol_type = Qunix;
11069 #endif
11070 staticpro (&system_eol_type);
11073 char *
11074 emacs_strerror (int error_number)
11076 char *str;
11078 synchronize_system_messages_locale ();
11079 str = strerror (error_number);
11081 if (! NILP (Vlocale_coding_system))
11083 Lisp_Object dec = code_convert_string_norecord (build_string (str),
11084 Vlocale_coding_system,
11086 str = SSDATA (dec);
11089 return str;
11092 #endif /* emacs */