Fix bug #13515 with processing DBCS file names on MS-Windows.
[emacs.git] / src / coding.c
blob20d5d92f0467ad2cfec973289bdb4f13fe56cac9
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2013 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26 /*** TABLE OF CONTENTS ***
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
44 /*** 0. General comments ***
47 CODING SYSTEM
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
67 o UTF-8
69 o UTF-16
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
94 o BIG5
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
102 o CCL
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
109 o Raw-text
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
115 o No-conversion
117 Like raw text, but don't do end-of-line conversion.
120 END-OF-LINE FORMAT
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
132 STRUCT CODING_SYSTEM
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
141 /* COMMON MACROS */
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
150 Return true if the byte sequence conforms to XXX.
152 Below is the template of these functions. */
154 #if 0
155 static bool
156 detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
161 bool multibytep = coding->src_multibyte;
162 ptrdiff_t consumed_chars = 0;
163 int found = 0;
164 ...;
166 while (1)
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (! __C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX;
177 /* The byte sequence is invalid for XXX. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0;
181 no_more_source:
182 /* The source exhausted successfully. */
183 detect_info->found |= found;
184 return 1;
186 #endif
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
200 Below is the template of these functions. */
202 #if 0
203 static void
204 decode_coding_XXXX (struct coding_system *coding)
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base;
212 /* A buffer to produce decoded characters. */
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
215 bool multibytep = coding->src_multibyte;
217 while (1)
219 src_base = src;
220 if (charbuf < charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
240 #endif
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
258 Below is a template of these functions. */
259 #if 0
260 static void
261 encode_coding_XXX (struct coding_system *coding)
263 bool multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
269 ptrdiff_t produced_chars = 0;
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. */
276 label_no_more_destination:
277 /* How many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
281 #endif
284 /*** 1. Preamble ***/
286 #include <config.h>
287 #include <stdio.h>
289 #include "lisp.h"
290 #include "character.h"
291 #include "buffer.h"
292 #include "charset.h"
293 #include "ccl.h"
294 #include "composite.h"
295 #include "coding.h"
296 #include "window.h"
297 #include "frame.h"
298 #include "termhooks.h"
300 Lisp_Object Vcoding_system_hash_table;
302 static Lisp_Object Qcoding_system, Qeol_type;
303 static Lisp_Object Qcoding_aliases;
304 Lisp_Object Qunix, Qdos;
305 Lisp_Object Qbuffer_file_coding_system;
306 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
307 static Lisp_Object Qdefault_char;
308 Lisp_Object Qno_conversion, Qundecided;
309 Lisp_Object Qcharset, Qutf_8;
310 static Lisp_Object Qiso_2022;
311 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
312 static Lisp_Object Qbig, Qlittle;
313 static Lisp_Object Qcoding_system_history;
314 static Lisp_Object Qvalid_codes;
315 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
316 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
317 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
318 static Lisp_Object QCascii_compatible_p;
320 Lisp_Object Qcall_process, Qcall_process_region;
321 Lisp_Object Qstart_process, Qopen_network_stream;
322 static Lisp_Object Qtarget_idx;
324 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
325 static Lisp_Object Qinterrupted, Qinsufficient_memory;
327 /* If a symbol has this property, evaluate the value to define the
328 symbol as a coding system. */
329 static Lisp_Object Qcoding_system_define_form;
331 /* Format of end-of-line decided by system. This is Qunix on
332 Unix and Mac, Qdos on DOS/Windows.
333 This has an effect only for external encoding (i.e. for output to
334 file and process), not for in-buffer or Lisp string encoding. */
335 static Lisp_Object system_eol_type;
337 #ifdef emacs
339 Lisp_Object Qcoding_system_p, Qcoding_system_error;
341 /* Coding system emacs-mule and raw-text are for converting only
342 end-of-line format. */
343 Lisp_Object Qemacs_mule, Qraw_text;
344 Lisp_Object Qutf_8_emacs;
346 #if defined (WINDOWSNT) || defined (CYGWIN)
347 static Lisp_Object Qutf_16le;
348 #endif
350 /* Coding-systems are handed between Emacs Lisp programs and C internal
351 routines by the following three variables. */
352 /* Coding system to be used to encode text for terminal display when
353 terminal coding system is nil. */
354 struct coding_system safe_terminal_coding;
356 #endif /* emacs */
358 Lisp_Object Qtranslation_table;
359 Lisp_Object Qtranslation_table_id;
360 static Lisp_Object Qtranslation_table_for_decode;
361 static Lisp_Object Qtranslation_table_for_encode;
363 /* Two special coding systems. */
364 static Lisp_Object Vsjis_coding_system;
365 static Lisp_Object Vbig5_coding_system;
367 /* ISO2022 section */
369 #define CODING_ISO_INITIAL(coding, reg) \
370 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
371 coding_attr_iso_initial), \
372 reg)))
375 #define CODING_ISO_REQUEST(coding, charset_id) \
376 (((charset_id) <= (coding)->max_charset_id \
377 ? ((coding)->safe_charsets[charset_id] != 255 \
378 ? (coding)->safe_charsets[charset_id] \
379 : -1) \
380 : -1))
383 #define CODING_ISO_FLAGS(coding) \
384 ((coding)->spec.iso_2022.flags)
385 #define CODING_ISO_DESIGNATION(coding, reg) \
386 ((coding)->spec.iso_2022.current_designation[reg])
387 #define CODING_ISO_INVOCATION(coding, plane) \
388 ((coding)->spec.iso_2022.current_invocation[plane])
389 #define CODING_ISO_SINGLE_SHIFTING(coding) \
390 ((coding)->spec.iso_2022.single_shifting)
391 #define CODING_ISO_BOL(coding) \
392 ((coding)->spec.iso_2022.bol)
393 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
394 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
395 #define CODING_ISO_CMP_STATUS(coding) \
396 (&(coding)->spec.iso_2022.cmp_status)
397 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
398 ((coding)->spec.iso_2022.ctext_extended_segment_len)
399 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
400 ((coding)->spec.iso_2022.embedded_utf_8)
402 /* Control characters of ISO2022. */
403 /* code */ /* function */
404 #define ISO_CODE_SO 0x0E /* shift-out */
405 #define ISO_CODE_SI 0x0F /* shift-in */
406 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
407 #define ISO_CODE_ESC 0x1B /* escape */
408 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
409 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
410 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
412 /* All code (1-byte) of ISO2022 is classified into one of the
413 followings. */
414 enum iso_code_class_type
416 ISO_control_0, /* Control codes in the range
417 0x00..0x1F and 0x7F, except for the
418 following 5 codes. */
419 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
420 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
421 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
422 ISO_escape, /* ISO_CODE_ESC (0x1B) */
423 ISO_control_1, /* Control codes in the range
424 0x80..0x9F, except for the
425 following 3 codes. */
426 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
427 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
428 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
429 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
430 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
431 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
432 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
435 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
436 `iso-flags' attribute of an iso2022 coding system. */
438 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
439 instead of the correct short-form sequence (e.g. ESC $ A). */
440 #define CODING_ISO_FLAG_LONG_FORM 0x0001
442 /* If set, reset graphic planes and registers at end-of-line to the
443 initial state. */
444 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
446 /* If set, reset graphic planes and registers before any control
447 characters to the initial state. */
448 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
450 /* If set, encode by 7-bit environment. */
451 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
453 /* If set, use locking-shift function. */
454 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
456 /* If set, use single-shift function. Overwrite
457 CODING_ISO_FLAG_LOCKING_SHIFT. */
458 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
460 /* If set, use designation escape sequence. */
461 #define CODING_ISO_FLAG_DESIGNATION 0x0040
463 /* If set, produce revision number sequence. */
464 #define CODING_ISO_FLAG_REVISION 0x0080
466 /* If set, produce ISO6429's direction specifying sequence. */
467 #define CODING_ISO_FLAG_DIRECTION 0x0100
469 /* If set, assume designation states are reset at beginning of line on
470 output. */
471 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
473 /* If set, designation sequence should be placed at beginning of line
474 on output. */
475 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
477 /* If set, do not encode unsafe characters on output. */
478 #define CODING_ISO_FLAG_SAFE 0x0800
480 /* If set, extra latin codes (128..159) are accepted as a valid code
481 on input. */
482 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
484 #define CODING_ISO_FLAG_COMPOSITION 0x2000
486 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
488 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
490 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
492 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
494 /* A character to be produced on output if encoding of the original
495 character is prohibited by CODING_ISO_FLAG_SAFE. */
496 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
498 /* UTF-8 section */
499 #define CODING_UTF_8_BOM(coding) \
500 ((coding)->spec.utf_8_bom)
502 /* UTF-16 section */
503 #define CODING_UTF_16_BOM(coding) \
504 ((coding)->spec.utf_16.bom)
506 #define CODING_UTF_16_ENDIAN(coding) \
507 ((coding)->spec.utf_16.endian)
509 #define CODING_UTF_16_SURROGATE(coding) \
510 ((coding)->spec.utf_16.surrogate)
513 /* CCL section */
514 #define CODING_CCL_DECODER(coding) \
515 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
516 #define CODING_CCL_ENCODER(coding) \
517 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
518 #define CODING_CCL_VALIDS(coding) \
519 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
521 /* Index for each coding category in `coding_categories' */
523 enum coding_category
525 coding_category_iso_7,
526 coding_category_iso_7_tight,
527 coding_category_iso_8_1,
528 coding_category_iso_8_2,
529 coding_category_iso_7_else,
530 coding_category_iso_8_else,
531 coding_category_utf_8_auto,
532 coding_category_utf_8_nosig,
533 coding_category_utf_8_sig,
534 coding_category_utf_16_auto,
535 coding_category_utf_16_be,
536 coding_category_utf_16_le,
537 coding_category_utf_16_be_nosig,
538 coding_category_utf_16_le_nosig,
539 coding_category_charset,
540 coding_category_sjis,
541 coding_category_big5,
542 coding_category_ccl,
543 coding_category_emacs_mule,
544 /* All above are targets of code detection. */
545 coding_category_raw_text,
546 coding_category_undecided,
547 coding_category_max
550 /* Definitions of flag bits used in detect_coding_XXXX. */
551 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
552 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
553 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
554 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
555 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
556 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
557 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
558 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
559 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
560 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
561 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
562 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
563 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
564 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
565 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
566 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
567 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
568 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
569 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
570 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
572 /* This value is returned if detect_coding_mask () find nothing other
573 than ASCII characters. */
574 #define CATEGORY_MASK_ANY \
575 (CATEGORY_MASK_ISO_7 \
576 | CATEGORY_MASK_ISO_7_TIGHT \
577 | CATEGORY_MASK_ISO_8_1 \
578 | CATEGORY_MASK_ISO_8_2 \
579 | CATEGORY_MASK_ISO_7_ELSE \
580 | CATEGORY_MASK_ISO_8_ELSE \
581 | CATEGORY_MASK_UTF_8_AUTO \
582 | CATEGORY_MASK_UTF_8_NOSIG \
583 | CATEGORY_MASK_UTF_8_SIG \
584 | CATEGORY_MASK_UTF_16_AUTO \
585 | CATEGORY_MASK_UTF_16_BE \
586 | CATEGORY_MASK_UTF_16_LE \
587 | CATEGORY_MASK_UTF_16_BE_NOSIG \
588 | CATEGORY_MASK_UTF_16_LE_NOSIG \
589 | CATEGORY_MASK_CHARSET \
590 | CATEGORY_MASK_SJIS \
591 | CATEGORY_MASK_BIG5 \
592 | CATEGORY_MASK_CCL \
593 | CATEGORY_MASK_EMACS_MULE)
596 #define CATEGORY_MASK_ISO_7BIT \
597 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
599 #define CATEGORY_MASK_ISO_8BIT \
600 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
602 #define CATEGORY_MASK_ISO_ELSE \
603 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
605 #define CATEGORY_MASK_ISO_ESCAPE \
606 (CATEGORY_MASK_ISO_7 \
607 | CATEGORY_MASK_ISO_7_TIGHT \
608 | CATEGORY_MASK_ISO_7_ELSE \
609 | CATEGORY_MASK_ISO_8_ELSE)
611 #define CATEGORY_MASK_ISO \
612 ( CATEGORY_MASK_ISO_7BIT \
613 | CATEGORY_MASK_ISO_8BIT \
614 | CATEGORY_MASK_ISO_ELSE)
616 #define CATEGORY_MASK_UTF_16 \
617 (CATEGORY_MASK_UTF_16_AUTO \
618 | CATEGORY_MASK_UTF_16_BE \
619 | CATEGORY_MASK_UTF_16_LE \
620 | CATEGORY_MASK_UTF_16_BE_NOSIG \
621 | CATEGORY_MASK_UTF_16_LE_NOSIG)
623 #define CATEGORY_MASK_UTF_8 \
624 (CATEGORY_MASK_UTF_8_AUTO \
625 | CATEGORY_MASK_UTF_8_NOSIG \
626 | CATEGORY_MASK_UTF_8_SIG)
628 /* Table of coding categories (Lisp symbols). This variable is for
629 internal use only. */
630 static Lisp_Object Vcoding_category_table;
632 /* Table of coding-categories ordered by priority. */
633 static enum coding_category coding_priorities[coding_category_max];
635 /* Nth element is a coding context for the coding system bound to the
636 Nth coding category. */
637 static struct coding_system coding_categories[coding_category_max];
639 /*** Commonly used macros and functions ***/
641 #ifndef min
642 #define min(a, b) ((a) < (b) ? (a) : (b))
643 #endif
644 #ifndef max
645 #define max(a, b) ((a) > (b) ? (a) : (b))
646 #endif
648 #define CODING_GET_INFO(coding, attrs, charset_list) \
649 do { \
650 (attrs) = CODING_ID_ATTRS ((coding)->id); \
651 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
652 } while (0)
655 /* Safely get one byte from the source text pointed by SRC which ends
656 at SRC_END, and set C to that byte. If there are not enough bytes
657 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
658 and a multibyte character is found at SRC, set C to the
659 negative value of the character code. The caller should declare
660 and set these variables appropriately in advance:
661 src, src_end, multibytep */
663 #define ONE_MORE_BYTE(c) \
664 do { \
665 if (src == src_end) \
667 if (src_base < src) \
668 record_conversion_result \
669 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
670 goto no_more_source; \
672 c = *src++; \
673 if (multibytep && (c & 0x80)) \
675 if ((c & 0xFE) == 0xC0) \
676 c = ((c & 1) << 6) | *src++; \
677 else \
679 src--; \
680 c = - string_char (src, &src, NULL); \
681 record_conversion_result \
682 (coding, CODING_RESULT_INVALID_SRC); \
685 consumed_chars++; \
686 } while (0)
688 /* Safely get two bytes from the source text pointed by SRC which ends
689 at SRC_END, and set C1 and C2 to those bytes while skipping the
690 heading multibyte characters. If there are not enough bytes in the
691 source, it jumps to 'no_more_source'. If MULTIBYTEP and
692 a multibyte character is found for C2, set C2 to the negative value
693 of the character code. The caller should declare and set these
694 variables appropriately in advance:
695 src, src_end, multibytep
696 It is intended that this macro is used in detect_coding_utf_16. */
698 #define TWO_MORE_BYTES(c1, c2) \
699 do { \
700 do { \
701 if (src == src_end) \
702 goto no_more_source; \
703 c1 = *src++; \
704 if (multibytep && (c1 & 0x80)) \
706 if ((c1 & 0xFE) == 0xC0) \
707 c1 = ((c1 & 1) << 6) | *src++; \
708 else \
710 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
711 c1 = -1; \
714 } while (c1 < 0); \
715 if (src == src_end) \
716 goto no_more_source; \
717 c2 = *src++; \
718 if (multibytep && (c2 & 0x80)) \
720 if ((c2 & 0xFE) == 0xC0) \
721 c2 = ((c2 & 1) << 6) | *src++; \
722 else \
723 c2 = -1; \
725 } while (0)
728 /* Store a byte C in the place pointed by DST and increment DST to the
729 next free point, and increment PRODUCED_CHARS. The caller should
730 assure that C is 0..127, and declare and set the variable `dst'
731 appropriately in advance.
735 #define EMIT_ONE_ASCII_BYTE(c) \
736 do { \
737 produced_chars++; \
738 *dst++ = (c); \
739 } while (0)
742 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
744 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
745 do { \
746 produced_chars += 2; \
747 *dst++ = (c1), *dst++ = (c2); \
748 } while (0)
751 /* Store a byte C in the place pointed by DST and increment DST to the
752 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
753 store in an appropriate multibyte form. The caller should
754 declare and set the variables `dst' and `multibytep' appropriately
755 in advance. */
757 #define EMIT_ONE_BYTE(c) \
758 do { \
759 produced_chars++; \
760 if (multibytep) \
762 unsigned ch = (c); \
763 if (ch >= 0x80) \
764 ch = BYTE8_TO_CHAR (ch); \
765 CHAR_STRING_ADVANCE (ch, dst); \
767 else \
768 *dst++ = (c); \
769 } while (0)
772 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
774 #define EMIT_TWO_BYTES(c1, c2) \
775 do { \
776 produced_chars += 2; \
777 if (multibytep) \
779 unsigned ch; \
781 ch = (c1); \
782 if (ch >= 0x80) \
783 ch = BYTE8_TO_CHAR (ch); \
784 CHAR_STRING_ADVANCE (ch, dst); \
785 ch = (c2); \
786 if (ch >= 0x80) \
787 ch = BYTE8_TO_CHAR (ch); \
788 CHAR_STRING_ADVANCE (ch, dst); \
790 else \
792 *dst++ = (c1); \
793 *dst++ = (c2); \
795 } while (0)
798 #define EMIT_THREE_BYTES(c1, c2, c3) \
799 do { \
800 EMIT_ONE_BYTE (c1); \
801 EMIT_TWO_BYTES (c2, c3); \
802 } while (0)
805 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
806 do { \
807 EMIT_TWO_BYTES (c1, c2); \
808 EMIT_TWO_BYTES (c3, c4); \
809 } while (0)
812 static void
813 record_conversion_result (struct coding_system *coding,
814 enum coding_result_code result)
816 coding->result = result;
817 switch (result)
819 case CODING_RESULT_INSUFFICIENT_SRC:
820 Vlast_code_conversion_error = Qinsufficient_source;
821 break;
822 case CODING_RESULT_INCONSISTENT_EOL:
823 Vlast_code_conversion_error = Qinconsistent_eol;
824 break;
825 case CODING_RESULT_INVALID_SRC:
826 Vlast_code_conversion_error = Qinvalid_source;
827 break;
828 case CODING_RESULT_INTERRUPT:
829 Vlast_code_conversion_error = Qinterrupted;
830 break;
831 case CODING_RESULT_INSUFFICIENT_MEM:
832 Vlast_code_conversion_error = Qinsufficient_memory;
833 break;
834 case CODING_RESULT_INSUFFICIENT_DST:
835 /* Don't record this error in Vlast_code_conversion_error
836 because it happens just temporarily and is resolved when the
837 whole conversion is finished. */
838 break;
839 case CODING_RESULT_SUCCESS:
840 break;
841 default:
842 Vlast_code_conversion_error = intern ("Unknown error");
846 /* These wrapper macros are used to preserve validity of pointers into
847 buffer text across calls to decode_char, encode_char, etc, which
848 could cause relocation of buffers if it loads a charset map,
849 because loading a charset map allocates large structures. */
851 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
852 do { \
853 ptrdiff_t offset; \
855 charset_map_loaded = 0; \
856 c = DECODE_CHAR (charset, code); \
857 if (charset_map_loaded \
858 && (offset = coding_change_source (coding))) \
860 src += offset; \
861 src_base += offset; \
862 src_end += offset; \
864 } while (0)
866 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
867 do { \
868 ptrdiff_t offset; \
870 charset_map_loaded = 0; \
871 code = ENCODE_CHAR (charset, c); \
872 if (charset_map_loaded \
873 && (offset = coding_change_destination (coding))) \
875 dst += offset; \
876 dst_end += offset; \
878 } while (0)
880 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
881 do { \
882 ptrdiff_t offset; \
884 charset_map_loaded = 0; \
885 charset = char_charset (c, charset_list, code_return); \
886 if (charset_map_loaded \
887 && (offset = coding_change_destination (coding))) \
889 dst += offset; \
890 dst_end += offset; \
892 } while (0)
894 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
895 do { \
896 ptrdiff_t offset; \
898 charset_map_loaded = 0; \
899 result = CHAR_CHARSET_P (c, charset); \
900 if (charset_map_loaded \
901 && (offset = coding_change_destination (coding))) \
903 dst += offset; \
904 dst_end += offset; \
906 } while (0)
909 /* If there are at least BYTES length of room at dst, allocate memory
910 for coding->destination and update dst and dst_end. We don't have
911 to take care of coding->source which will be relocated. It is
912 handled by calling coding_set_source in encode_coding. */
914 #define ASSURE_DESTINATION(bytes) \
915 do { \
916 if (dst + (bytes) >= dst_end) \
918 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
920 dst = alloc_destination (coding, more_bytes, dst); \
921 dst_end = coding->destination + coding->dst_bytes; \
923 } while (0)
926 /* Store multibyte form of the character C in P, and advance P to the
927 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
928 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
929 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
931 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
933 /* Return the character code of character whose multibyte form is at
934 P, and advance P to the end of the multibyte form. This used to be
935 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
936 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
938 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
940 /* Set coding->source from coding->src_object. */
942 static void
943 coding_set_source (struct coding_system *coding)
945 if (BUFFERP (coding->src_object))
947 struct buffer *buf = XBUFFER (coding->src_object);
949 if (coding->src_pos < 0)
950 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
951 else
952 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
954 else if (STRINGP (coding->src_object))
956 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
958 else
960 /* Otherwise, the source is C string and is never relocated
961 automatically. Thus we don't have to update anything. */
966 /* Set coding->source from coding->src_object, and return how many
967 bytes coding->source was changed. */
969 static ptrdiff_t
970 coding_change_source (struct coding_system *coding)
972 const unsigned char *orig = coding->source;
973 coding_set_source (coding);
974 return coding->source - orig;
978 /* Set coding->destination from coding->dst_object. */
980 static void
981 coding_set_destination (struct coding_system *coding)
983 if (BUFFERP (coding->dst_object))
985 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
987 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
988 coding->dst_bytes = (GAP_END_ADDR
989 - (coding->src_bytes - coding->consumed)
990 - coding->destination);
992 else
994 /* We are sure that coding->dst_pos_byte is before the gap
995 of the buffer. */
996 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
997 + coding->dst_pos_byte - BEG_BYTE);
998 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
999 - coding->destination);
1002 else
1004 /* Otherwise, the destination is C string and is never relocated
1005 automatically. Thus we don't have to update anything. */
1010 /* Set coding->destination from coding->dst_object, and return how
1011 many bytes coding->destination was changed. */
1013 static ptrdiff_t
1014 coding_change_destination (struct coding_system *coding)
1016 const unsigned char *orig = coding->destination;
1017 coding_set_destination (coding);
1018 return coding->destination - orig;
1022 static void
1023 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1025 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1026 string_overflow ();
1027 coding->destination = xrealloc (coding->destination,
1028 coding->dst_bytes + bytes);
1029 coding->dst_bytes += bytes;
1032 static void
1033 coding_alloc_by_making_gap (struct coding_system *coding,
1034 ptrdiff_t gap_head_used, ptrdiff_t bytes)
1036 if (EQ (coding->src_object, coding->dst_object))
1038 /* The gap may contain the produced data at the head and not-yet
1039 consumed data at the tail. To preserve those data, we at
1040 first make the gap size to zero, then increase the gap
1041 size. */
1042 ptrdiff_t add = GAP_SIZE;
1044 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1045 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1046 make_gap (bytes);
1047 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1048 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1050 else
1052 Lisp_Object this_buffer;
1054 this_buffer = Fcurrent_buffer ();
1055 set_buffer_internal (XBUFFER (coding->dst_object));
1056 make_gap (bytes);
1057 set_buffer_internal (XBUFFER (this_buffer));
1062 static unsigned char *
1063 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1064 unsigned char *dst)
1066 ptrdiff_t offset = dst - coding->destination;
1068 if (BUFFERP (coding->dst_object))
1070 struct buffer *buf = XBUFFER (coding->dst_object);
1072 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1074 else
1075 coding_alloc_by_realloc (coding, nbytes);
1076 coding_set_destination (coding);
1077 dst = coding->destination + offset;
1078 return dst;
1081 /** Macros for annotations. */
1083 /* An annotation data is stored in the array coding->charbuf in this
1084 format:
1085 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1086 LENGTH is the number of elements in the annotation.
1087 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1088 NCHARS is the number of characters in the text annotated.
1090 The format of the following elements depend on ANNOTATION_MASK.
1092 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1093 follows:
1094 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1096 NBYTES is the number of bytes specified in the header part of
1097 old-style emacs-mule encoding, or 0 for the other kind of
1098 composition.
1100 METHOD is one of enum composition_method.
1102 Optional COMPOSITION-COMPONENTS are characters and composition
1103 rules.
1105 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1106 follows.
1108 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1109 recover from an invalid annotation, and should be skipped by
1110 produce_annotation. */
1112 /* Maximum length of the header of annotation data. */
1113 #define MAX_ANNOTATION_LENGTH 5
1115 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1116 do { \
1117 *(buf)++ = -(len); \
1118 *(buf)++ = (mask); \
1119 *(buf)++ = (nchars); \
1120 coding->annotated = 1; \
1121 } while (0);
1123 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1124 do { \
1125 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1126 *buf++ = nbytes; \
1127 *buf++ = method; \
1128 } while (0)
1131 #define ADD_CHARSET_DATA(buf, nchars, id) \
1132 do { \
1133 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1134 *buf++ = id; \
1135 } while (0)
1138 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1143 /*** 3. UTF-8 ***/
1145 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1146 Return true if a text is encoded in UTF-8. */
1148 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1149 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1150 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1151 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1152 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1153 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1155 #define UTF_8_BOM_1 0xEF
1156 #define UTF_8_BOM_2 0xBB
1157 #define UTF_8_BOM_3 0xBF
1159 static bool
1160 detect_coding_utf_8 (struct coding_system *coding,
1161 struct coding_detection_info *detect_info)
1163 const unsigned char *src = coding->source, *src_base;
1164 const unsigned char *src_end = coding->source + coding->src_bytes;
1165 bool multibytep = coding->src_multibyte;
1166 ptrdiff_t consumed_chars = 0;
1167 bool bom_found = 0;
1168 bool found = 0;
1170 detect_info->checked |= CATEGORY_MASK_UTF_8;
1171 /* A coding system of this category is always ASCII compatible. */
1172 src += coding->head_ascii;
1174 while (1)
1176 int c, c1, c2, c3, c4;
1178 src_base = src;
1179 ONE_MORE_BYTE (c);
1180 if (c < 0 || UTF_8_1_OCTET_P (c))
1181 continue;
1182 ONE_MORE_BYTE (c1);
1183 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1184 break;
1185 if (UTF_8_2_OCTET_LEADING_P (c))
1187 found = 1;
1188 continue;
1190 ONE_MORE_BYTE (c2);
1191 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1192 break;
1193 if (UTF_8_3_OCTET_LEADING_P (c))
1195 found = 1;
1196 if (src_base == coding->source
1197 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1198 bom_found = 1;
1199 continue;
1201 ONE_MORE_BYTE (c3);
1202 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1203 break;
1204 if (UTF_8_4_OCTET_LEADING_P (c))
1206 found = 1;
1207 continue;
1209 ONE_MORE_BYTE (c4);
1210 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1211 break;
1212 if (UTF_8_5_OCTET_LEADING_P (c))
1214 found = 1;
1215 continue;
1217 break;
1219 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1220 return 0;
1222 no_more_source:
1223 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1225 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1226 return 0;
1228 if (bom_found)
1230 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1231 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1233 else
1235 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1236 if (found)
1237 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1239 return 1;
1243 static void
1244 decode_coding_utf_8 (struct coding_system *coding)
1246 const unsigned char *src = coding->source + coding->consumed;
1247 const unsigned char *src_end = coding->source + coding->src_bytes;
1248 const unsigned char *src_base;
1249 int *charbuf = coding->charbuf + coding->charbuf_used;
1250 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1251 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1252 bool multibytep = coding->src_multibyte;
1253 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1254 bool eol_dos
1255 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1256 int byte_after_cr = -1;
1258 if (bom != utf_without_bom)
1260 int c1, c2, c3;
1262 src_base = src;
1263 ONE_MORE_BYTE (c1);
1264 if (! UTF_8_3_OCTET_LEADING_P (c1))
1265 src = src_base;
1266 else
1268 ONE_MORE_BYTE (c2);
1269 if (! UTF_8_EXTRA_OCTET_P (c2))
1270 src = src_base;
1271 else
1273 ONE_MORE_BYTE (c3);
1274 if (! UTF_8_EXTRA_OCTET_P (c3))
1275 src = src_base;
1276 else
1278 if ((c1 != UTF_8_BOM_1)
1279 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1280 src = src_base;
1281 else
1282 CODING_UTF_8_BOM (coding) = utf_without_bom;
1287 CODING_UTF_8_BOM (coding) = utf_without_bom;
1289 while (1)
1291 int c, c1, c2, c3, c4, c5;
1293 src_base = src;
1294 consumed_chars_base = consumed_chars;
1296 if (charbuf >= charbuf_end)
1298 if (byte_after_cr >= 0)
1299 src_base--;
1300 break;
1303 if (byte_after_cr >= 0)
1304 c1 = byte_after_cr, byte_after_cr = -1;
1305 else
1306 ONE_MORE_BYTE (c1);
1307 if (c1 < 0)
1309 c = - c1;
1311 else if (UTF_8_1_OCTET_P (c1))
1313 if (eol_dos && c1 == '\r')
1314 ONE_MORE_BYTE (byte_after_cr);
1315 c = c1;
1317 else
1319 ONE_MORE_BYTE (c2);
1320 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1321 goto invalid_code;
1322 if (UTF_8_2_OCTET_LEADING_P (c1))
1324 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1325 /* Reject overlong sequences here and below. Encoders
1326 producing them are incorrect, they can be misleading,
1327 and they mess up read/write invariance. */
1328 if (c < 128)
1329 goto invalid_code;
1331 else
1333 ONE_MORE_BYTE (c3);
1334 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1335 goto invalid_code;
1336 if (UTF_8_3_OCTET_LEADING_P (c1))
1338 c = (((c1 & 0xF) << 12)
1339 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1340 if (c < 0x800
1341 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1342 goto invalid_code;
1344 else
1346 ONE_MORE_BYTE (c4);
1347 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1348 goto invalid_code;
1349 if (UTF_8_4_OCTET_LEADING_P (c1))
1351 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1352 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1353 if (c < 0x10000)
1354 goto invalid_code;
1356 else
1358 ONE_MORE_BYTE (c5);
1359 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1360 goto invalid_code;
1361 if (UTF_8_5_OCTET_LEADING_P (c1))
1363 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1364 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1365 | (c5 & 0x3F));
1366 if ((c > MAX_CHAR) || (c < 0x200000))
1367 goto invalid_code;
1369 else
1370 goto invalid_code;
1376 *charbuf++ = c;
1377 continue;
1379 invalid_code:
1380 src = src_base;
1381 consumed_chars = consumed_chars_base;
1382 ONE_MORE_BYTE (c);
1383 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1384 coding->errors++;
1387 no_more_source:
1388 coding->consumed_char += consumed_chars_base;
1389 coding->consumed = src_base - coding->source;
1390 coding->charbuf_used = charbuf - coding->charbuf;
1394 static bool
1395 encode_coding_utf_8 (struct coding_system *coding)
1397 bool multibytep = coding->dst_multibyte;
1398 int *charbuf = coding->charbuf;
1399 int *charbuf_end = charbuf + coding->charbuf_used;
1400 unsigned char *dst = coding->destination + coding->produced;
1401 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1402 ptrdiff_t produced_chars = 0;
1403 int c;
1405 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1407 ASSURE_DESTINATION (3);
1408 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1409 CODING_UTF_8_BOM (coding) = utf_without_bom;
1412 if (multibytep)
1414 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1416 while (charbuf < charbuf_end)
1418 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1420 ASSURE_DESTINATION (safe_room);
1421 c = *charbuf++;
1422 if (CHAR_BYTE8_P (c))
1424 c = CHAR_TO_BYTE8 (c);
1425 EMIT_ONE_BYTE (c);
1427 else
1429 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1430 for (p = str; p < pend; p++)
1431 EMIT_ONE_BYTE (*p);
1435 else
1437 int safe_room = MAX_MULTIBYTE_LENGTH;
1439 while (charbuf < charbuf_end)
1441 ASSURE_DESTINATION (safe_room);
1442 c = *charbuf++;
1443 if (CHAR_BYTE8_P (c))
1444 *dst++ = CHAR_TO_BYTE8 (c);
1445 else
1446 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1447 produced_chars++;
1450 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1451 coding->produced_char += produced_chars;
1452 coding->produced = dst - coding->destination;
1453 return 0;
1457 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1458 Return true if a text is encoded in one of UTF-16 based coding systems. */
1460 #define UTF_16_HIGH_SURROGATE_P(val) \
1461 (((val) & 0xFC00) == 0xD800)
1463 #define UTF_16_LOW_SURROGATE_P(val) \
1464 (((val) & 0xFC00) == 0xDC00)
1467 static bool
1468 detect_coding_utf_16 (struct coding_system *coding,
1469 struct coding_detection_info *detect_info)
1471 const unsigned char *src = coding->source;
1472 const unsigned char *src_end = coding->source + coding->src_bytes;
1473 bool multibytep = coding->src_multibyte;
1474 int c1, c2;
1476 detect_info->checked |= CATEGORY_MASK_UTF_16;
1477 if (coding->mode & CODING_MODE_LAST_BLOCK
1478 && (coding->src_chars & 1))
1480 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1481 return 0;
1484 TWO_MORE_BYTES (c1, c2);
1485 if ((c1 == 0xFF) && (c2 == 0xFE))
1487 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1488 | CATEGORY_MASK_UTF_16_AUTO);
1489 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1490 | CATEGORY_MASK_UTF_16_BE_NOSIG
1491 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1493 else if ((c1 == 0xFE) && (c2 == 0xFF))
1495 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1496 | CATEGORY_MASK_UTF_16_AUTO);
1497 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1498 | CATEGORY_MASK_UTF_16_BE_NOSIG
1499 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1501 else if (c2 < 0)
1503 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1504 return 0;
1506 else
1508 /* We check the dispersion of Eth and Oth bytes where E is even and
1509 O is odd. If both are high, we assume binary data.*/
1510 unsigned char e[256], o[256];
1511 unsigned e_num = 1, o_num = 1;
1513 memset (e, 0, 256);
1514 memset (o, 0, 256);
1515 e[c1] = 1;
1516 o[c2] = 1;
1518 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1519 |CATEGORY_MASK_UTF_16_BE
1520 | CATEGORY_MASK_UTF_16_LE);
1522 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1523 != CATEGORY_MASK_UTF_16)
1525 TWO_MORE_BYTES (c1, c2);
1526 if (c2 < 0)
1527 break;
1528 if (! e[c1])
1530 e[c1] = 1;
1531 e_num++;
1532 if (e_num >= 128)
1533 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1535 if (! o[c2])
1537 o[c2] = 1;
1538 o_num++;
1539 if (o_num >= 128)
1540 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1543 return 0;
1546 no_more_source:
1547 return 1;
1550 static void
1551 decode_coding_utf_16 (struct coding_system *coding)
1553 const unsigned char *src = coding->source + coding->consumed;
1554 const unsigned char *src_end = coding->source + coding->src_bytes;
1555 const unsigned char *src_base;
1556 int *charbuf = coding->charbuf + coding->charbuf_used;
1557 /* We may produces at most 3 chars in one loop. */
1558 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1559 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1560 bool multibytep = coding->src_multibyte;
1561 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1562 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1563 int surrogate = CODING_UTF_16_SURROGATE (coding);
1564 bool eol_dos
1565 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1566 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1568 if (bom == utf_with_bom)
1570 int c, c1, c2;
1572 src_base = src;
1573 ONE_MORE_BYTE (c1);
1574 ONE_MORE_BYTE (c2);
1575 c = (c1 << 8) | c2;
1577 if (endian == utf_16_big_endian
1578 ? c != 0xFEFF : c != 0xFFFE)
1580 /* The first two bytes are not BOM. Treat them as bytes
1581 for a normal character. */
1582 src = src_base;
1583 coding->errors++;
1585 CODING_UTF_16_BOM (coding) = utf_without_bom;
1587 else if (bom == utf_detect_bom)
1589 /* We have already tried to detect BOM and failed in
1590 detect_coding. */
1591 CODING_UTF_16_BOM (coding) = utf_without_bom;
1594 while (1)
1596 int c, c1, c2;
1598 src_base = src;
1599 consumed_chars_base = consumed_chars;
1601 if (charbuf >= charbuf_end)
1603 if (byte_after_cr1 >= 0)
1604 src_base -= 2;
1605 break;
1608 if (byte_after_cr1 >= 0)
1609 c1 = byte_after_cr1, byte_after_cr1 = -1;
1610 else
1611 ONE_MORE_BYTE (c1);
1612 if (c1 < 0)
1614 *charbuf++ = -c1;
1615 continue;
1617 if (byte_after_cr2 >= 0)
1618 c2 = byte_after_cr2, byte_after_cr2 = -1;
1619 else
1620 ONE_MORE_BYTE (c2);
1621 if (c2 < 0)
1623 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1624 *charbuf++ = -c2;
1625 continue;
1627 c = (endian == utf_16_big_endian
1628 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1630 if (surrogate)
1632 if (! UTF_16_LOW_SURROGATE_P (c))
1634 if (endian == utf_16_big_endian)
1635 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1636 else
1637 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1638 *charbuf++ = c1;
1639 *charbuf++ = c2;
1640 coding->errors++;
1641 if (UTF_16_HIGH_SURROGATE_P (c))
1642 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1643 else
1644 *charbuf++ = c;
1646 else
1648 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1649 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1650 *charbuf++ = 0x10000 + c;
1653 else
1655 if (UTF_16_HIGH_SURROGATE_P (c))
1656 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1657 else
1659 if (eol_dos && c == '\r')
1661 ONE_MORE_BYTE (byte_after_cr1);
1662 ONE_MORE_BYTE (byte_after_cr2);
1664 *charbuf++ = c;
1669 no_more_source:
1670 coding->consumed_char += consumed_chars_base;
1671 coding->consumed = src_base - coding->source;
1672 coding->charbuf_used = charbuf - coding->charbuf;
1675 static bool
1676 encode_coding_utf_16 (struct coding_system *coding)
1678 bool multibytep = coding->dst_multibyte;
1679 int *charbuf = coding->charbuf;
1680 int *charbuf_end = charbuf + coding->charbuf_used;
1681 unsigned char *dst = coding->destination + coding->produced;
1682 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1683 int safe_room = 8;
1684 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1685 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1686 ptrdiff_t produced_chars = 0;
1687 int c;
1689 if (bom != utf_without_bom)
1691 ASSURE_DESTINATION (safe_room);
1692 if (big_endian)
1693 EMIT_TWO_BYTES (0xFE, 0xFF);
1694 else
1695 EMIT_TWO_BYTES (0xFF, 0xFE);
1696 CODING_UTF_16_BOM (coding) = utf_without_bom;
1699 while (charbuf < charbuf_end)
1701 ASSURE_DESTINATION (safe_room);
1702 c = *charbuf++;
1703 if (c > MAX_UNICODE_CHAR)
1704 c = coding->default_char;
1706 if (c < 0x10000)
1708 if (big_endian)
1709 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1710 else
1711 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1713 else
1715 int c1, c2;
1717 c -= 0x10000;
1718 c1 = (c >> 10) + 0xD800;
1719 c2 = (c & 0x3FF) + 0xDC00;
1720 if (big_endian)
1721 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1722 else
1723 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1726 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1727 coding->produced = dst - coding->destination;
1728 coding->produced_char += produced_chars;
1729 return 0;
1733 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1735 /* Emacs' internal format for representation of multiple character
1736 sets is a kind of multi-byte encoding, i.e. characters are
1737 represented by variable-length sequences of one-byte codes.
1739 ASCII characters and control characters (e.g. `tab', `newline') are
1740 represented by one-byte sequences which are their ASCII codes, in
1741 the range 0x00 through 0x7F.
1743 8-bit characters of the range 0x80..0x9F are represented by
1744 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1745 code + 0x20).
1747 8-bit characters of the range 0xA0..0xFF are represented by
1748 one-byte sequences which are their 8-bit code.
1750 The other characters are represented by a sequence of `base
1751 leading-code', optional `extended leading-code', and one or two
1752 `position-code's. The length of the sequence is determined by the
1753 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1754 whereas extended leading-code and position-code take the range 0xA0
1755 through 0xFF. See `charset.h' for more details about leading-code
1756 and position-code.
1758 --- CODE RANGE of Emacs' internal format ---
1759 character set range
1760 ------------- -----
1761 ascii 0x00..0x7F
1762 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1763 eight-bit-graphic 0xA0..0xBF
1764 ELSE 0x81..0x9D + [0xA0..0xFF]+
1765 ---------------------------------------------
1767 As this is the internal character representation, the format is
1768 usually not used externally (i.e. in a file or in a data sent to a
1769 process). But, it is possible to have a text externally in this
1770 format (i.e. by encoding by the coding system `emacs-mule').
1772 In that case, a sequence of one-byte codes has a slightly different
1773 form.
1775 At first, all characters in eight-bit-control are represented by
1776 one-byte sequences which are their 8-bit code.
1778 Next, character composition data are represented by the byte
1779 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1780 where,
1781 METHOD is 0xF2 plus one of composition method (enum
1782 composition_method),
1784 BYTES is 0xA0 plus a byte length of this composition data,
1786 CHARS is 0xA0 plus a number of characters composed by this
1787 data,
1789 COMPONENTs are characters of multibyte form or composition
1790 rules encoded by two-byte of ASCII codes.
1792 In addition, for backward compatibility, the following formats are
1793 also recognized as composition data on decoding.
1795 0x80 MSEQ ...
1796 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1798 Here,
1799 MSEQ is a multibyte form but in these special format:
1800 ASCII: 0xA0 ASCII_CODE+0x80,
1801 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1802 RULE is a one byte code of the range 0xA0..0xF0 that
1803 represents a composition rule.
1806 char emacs_mule_bytes[256];
1809 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1810 Return true if a text is encoded in 'emacs-mule'. */
1812 static bool
1813 detect_coding_emacs_mule (struct coding_system *coding,
1814 struct coding_detection_info *detect_info)
1816 const unsigned char *src = coding->source, *src_base;
1817 const unsigned char *src_end = coding->source + coding->src_bytes;
1818 bool multibytep = coding->src_multibyte;
1819 ptrdiff_t consumed_chars = 0;
1820 int c;
1821 int found = 0;
1823 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1824 /* A coding system of this category is always ASCII compatible. */
1825 src += coding->head_ascii;
1827 while (1)
1829 src_base = src;
1830 ONE_MORE_BYTE (c);
1831 if (c < 0)
1832 continue;
1833 if (c == 0x80)
1835 /* Perhaps the start of composite character. We simply skip
1836 it because analyzing it is too heavy for detecting. But,
1837 at least, we check that the composite character
1838 constitutes of more than 4 bytes. */
1839 const unsigned char *src_start;
1841 repeat:
1842 src_start = src;
1845 ONE_MORE_BYTE (c);
1847 while (c >= 0xA0);
1849 if (src - src_start <= 4)
1850 break;
1851 found = CATEGORY_MASK_EMACS_MULE;
1852 if (c == 0x80)
1853 goto repeat;
1856 if (c < 0x80)
1858 if (c < 0x20
1859 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1860 break;
1862 else
1864 int more_bytes = emacs_mule_bytes[c] - 1;
1866 while (more_bytes > 0)
1868 ONE_MORE_BYTE (c);
1869 if (c < 0xA0)
1871 src--; /* Unread the last byte. */
1872 break;
1874 more_bytes--;
1876 if (more_bytes != 0)
1877 break;
1878 found = CATEGORY_MASK_EMACS_MULE;
1881 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1882 return 0;
1884 no_more_source:
1885 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1887 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1888 return 0;
1890 detect_info->found |= found;
1891 return 1;
1895 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1896 character. If CMP_STATUS indicates that we must expect MSEQ or
1897 RULE described above, decode it and return the negative value of
1898 the decoded character or rule. If an invalid byte is found, return
1899 -1. If SRC is too short, return -2. */
1901 static int
1902 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1903 int *nbytes, int *nchars, int *id,
1904 struct composition_status *cmp_status)
1906 const unsigned char *src_end = coding->source + coding->src_bytes;
1907 const unsigned char *src_base = src;
1908 bool multibytep = coding->src_multibyte;
1909 int charset_ID;
1910 unsigned code;
1911 int c;
1912 int consumed_chars = 0;
1913 bool mseq_found = 0;
1915 ONE_MORE_BYTE (c);
1916 if (c < 0)
1918 c = -c;
1919 charset_ID = emacs_mule_charset[0];
1921 else
1923 if (c >= 0xA0)
1925 if (cmp_status->state != COMPOSING_NO
1926 && cmp_status->old_form)
1928 if (cmp_status->state == COMPOSING_CHAR)
1930 if (c == 0xA0)
1932 ONE_MORE_BYTE (c);
1933 c -= 0x80;
1934 if (c < 0)
1935 goto invalid_code;
1937 else
1938 c -= 0x20;
1939 mseq_found = 1;
1941 else
1943 *nbytes = src - src_base;
1944 *nchars = consumed_chars;
1945 return -c;
1948 else
1949 goto invalid_code;
1952 switch (emacs_mule_bytes[c])
1954 case 2:
1955 if ((charset_ID = emacs_mule_charset[c]) < 0)
1956 goto invalid_code;
1957 ONE_MORE_BYTE (c);
1958 if (c < 0xA0)
1959 goto invalid_code;
1960 code = c & 0x7F;
1961 break;
1963 case 3:
1964 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1965 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1967 ONE_MORE_BYTE (c);
1968 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1969 goto invalid_code;
1970 ONE_MORE_BYTE (c);
1971 if (c < 0xA0)
1972 goto invalid_code;
1973 code = c & 0x7F;
1975 else
1977 if ((charset_ID = emacs_mule_charset[c]) < 0)
1978 goto invalid_code;
1979 ONE_MORE_BYTE (c);
1980 if (c < 0xA0)
1981 goto invalid_code;
1982 code = (c & 0x7F) << 8;
1983 ONE_MORE_BYTE (c);
1984 if (c < 0xA0)
1985 goto invalid_code;
1986 code |= c & 0x7F;
1988 break;
1990 case 4:
1991 ONE_MORE_BYTE (c);
1992 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1993 goto invalid_code;
1994 ONE_MORE_BYTE (c);
1995 if (c < 0xA0)
1996 goto invalid_code;
1997 code = (c & 0x7F) << 8;
1998 ONE_MORE_BYTE (c);
1999 if (c < 0xA0)
2000 goto invalid_code;
2001 code |= c & 0x7F;
2002 break;
2004 case 1:
2005 code = c;
2006 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2007 break;
2009 default:
2010 emacs_abort ();
2012 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2013 CHARSET_FROM_ID (charset_ID), code, c);
2014 if (c < 0)
2015 goto invalid_code;
2017 *nbytes = src - src_base;
2018 *nchars = consumed_chars;
2019 if (id)
2020 *id = charset_ID;
2021 return (mseq_found ? -c : c);
2023 no_more_source:
2024 return -2;
2026 invalid_code:
2027 return -1;
2031 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2033 /* Handle these composition sequence ('|': the end of header elements,
2034 BYTES and CHARS >= 0xA0):
2036 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2037 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2038 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2040 and these old form:
2042 (4) relative composition: 0x80 | MSEQ ... MSEQ
2043 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2045 When the starter 0x80 and the following header elements are found,
2046 this annotation header is produced.
2048 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2050 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2051 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2053 Then, upon reading the following elements, these codes are produced
2054 until the composition end is found:
2056 (1) CHAR ... CHAR
2057 (2) ALT ... ALT CHAR ... CHAR
2058 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2059 (4) CHAR ... CHAR
2060 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2062 When the composition end is found, LENGTH and NCHARS in the
2063 annotation header is updated as below:
2065 (1) LENGTH: unchanged, NCHARS: unchanged
2066 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2067 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2068 (4) LENGTH: unchanged, NCHARS: number of CHARs
2069 (5) LENGTH: unchanged, NCHARS: number of CHARs
2071 If an error is found while composing, the annotation header is
2072 changed to the original composition header (plus filler -1s) as
2073 below:
2075 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2076 (5) [ 0x80 0xFF -1 -1- -1 ]
2078 and the sequence [ -2 DECODED-RULE ] is changed to the original
2079 byte sequence as below:
2080 o the original byte sequence is B: [ B -1 ]
2081 o the original byte sequence is B1 B2: [ B1 B2 ]
2083 Most of the routines are implemented by macros because many
2084 variables and labels in the caller decode_coding_emacs_mule must be
2085 accessible, and they are usually called just once (thus doesn't
2086 increase the size of compiled object). */
2088 /* Decode a composition rule represented by C as a component of
2089 composition sequence of Emacs 20 style. Set RULE to the decoded
2090 rule. */
2092 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2093 do { \
2094 int gref, nref; \
2096 c -= 0xA0; \
2097 if (c < 0 || c >= 81) \
2098 goto invalid_code; \
2099 gref = c / 9, nref = c % 9; \
2100 if (gref == 4) gref = 10; \
2101 if (nref == 4) nref = 10; \
2102 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2103 } while (0)
2106 /* Decode a composition rule represented by C and the following byte
2107 at SRC as a component of composition sequence of Emacs 21 style.
2108 Set RULE to the decoded rule. */
2110 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2111 do { \
2112 int gref, nref; \
2114 gref = c - 0x20; \
2115 if (gref < 0 || gref >= 81) \
2116 goto invalid_code; \
2117 ONE_MORE_BYTE (c); \
2118 nref = c - 0x20; \
2119 if (nref < 0 || nref >= 81) \
2120 goto invalid_code; \
2121 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2122 } while (0)
2125 /* Start of Emacs 21 style format. The first three bytes at SRC are
2126 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2127 byte length of this composition information, CHARS is the number of
2128 characters composed by this composition. */
2130 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2131 do { \
2132 enum composition_method method = c - 0xF2; \
2133 int nbytes, nchars; \
2135 ONE_MORE_BYTE (c); \
2136 if (c < 0) \
2137 goto invalid_code; \
2138 nbytes = c - 0xA0; \
2139 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2140 goto invalid_code; \
2141 ONE_MORE_BYTE (c); \
2142 nchars = c - 0xA0; \
2143 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2144 goto invalid_code; \
2145 cmp_status->old_form = 0; \
2146 cmp_status->method = method; \
2147 if (method == COMPOSITION_RELATIVE) \
2148 cmp_status->state = COMPOSING_CHAR; \
2149 else \
2150 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2151 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2152 cmp_status->nchars = nchars; \
2153 cmp_status->ncomps = nbytes - 4; \
2154 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2155 } while (0)
2158 /* Start of Emacs 20 style format for relative composition. */
2160 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2161 do { \
2162 cmp_status->old_form = 1; \
2163 cmp_status->method = COMPOSITION_RELATIVE; \
2164 cmp_status->state = COMPOSING_CHAR; \
2165 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2166 cmp_status->nchars = cmp_status->ncomps = 0; \
2167 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2168 } while (0)
2171 /* Start of Emacs 20 style format for rule-base composition. */
2173 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2174 do { \
2175 cmp_status->old_form = 1; \
2176 cmp_status->method = COMPOSITION_WITH_RULE; \
2177 cmp_status->state = COMPOSING_CHAR; \
2178 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2179 cmp_status->nchars = cmp_status->ncomps = 0; \
2180 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2181 } while (0)
2184 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2185 do { \
2186 const unsigned char *current_src = src; \
2188 ONE_MORE_BYTE (c); \
2189 if (c < 0) \
2190 goto invalid_code; \
2191 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2192 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2193 DECODE_EMACS_MULE_21_COMPOSITION (); \
2194 else if (c < 0xA0) \
2195 goto invalid_code; \
2196 else if (c < 0xC0) \
2198 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2199 /* Re-read C as a composition component. */ \
2200 src = current_src; \
2202 else if (c == 0xFF) \
2203 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2204 else \
2205 goto invalid_code; \
2206 } while (0)
2208 #define EMACS_MULE_COMPOSITION_END() \
2209 do { \
2210 int idx = - cmp_status->length; \
2212 if (cmp_status->old_form) \
2213 charbuf[idx + 2] = cmp_status->nchars; \
2214 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2215 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2216 cmp_status->state = COMPOSING_NO; \
2217 } while (0)
2220 static int
2221 emacs_mule_finish_composition (int *charbuf,
2222 struct composition_status *cmp_status)
2224 int idx = - cmp_status->length;
2225 int new_chars;
2227 if (cmp_status->old_form && cmp_status->nchars > 0)
2229 charbuf[idx + 2] = cmp_status->nchars;
2230 new_chars = 0;
2231 if (cmp_status->method == COMPOSITION_WITH_RULE
2232 && cmp_status->state == COMPOSING_CHAR)
2234 /* The last rule was invalid. */
2235 int rule = charbuf[-1] + 0xA0;
2237 charbuf[-2] = BYTE8_TO_CHAR (rule);
2238 charbuf[-1] = -1;
2239 new_chars = 1;
2242 else
2244 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2246 if (cmp_status->method == COMPOSITION_WITH_RULE)
2248 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2249 charbuf[idx++] = -3;
2250 charbuf[idx++] = 0;
2251 new_chars = 1;
2253 else
2255 int nchars = charbuf[idx + 1] + 0xA0;
2256 int nbytes = charbuf[idx + 2] + 0xA0;
2258 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2259 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2260 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2261 charbuf[idx++] = -1;
2262 new_chars = 4;
2265 cmp_status->state = COMPOSING_NO;
2266 return new_chars;
2269 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2270 do { \
2271 if (cmp_status->state != COMPOSING_NO) \
2272 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2273 } while (0)
2276 static void
2277 decode_coding_emacs_mule (struct coding_system *coding)
2279 const unsigned char *src = coding->source + coding->consumed;
2280 const unsigned char *src_end = coding->source + coding->src_bytes;
2281 const unsigned char *src_base;
2282 int *charbuf = coding->charbuf + coding->charbuf_used;
2283 /* We may produce two annotations (charset and composition) in one
2284 loop and one more charset annotation at the end. */
2285 int *charbuf_end
2286 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2287 /* We can produce up to 2 characters in a loop. */
2288 - 1;
2289 ptrdiff_t consumed_chars = 0, consumed_chars_base;
2290 bool multibytep = coding->src_multibyte;
2291 ptrdiff_t char_offset = coding->produced_char;
2292 ptrdiff_t last_offset = char_offset;
2293 int last_id = charset_ascii;
2294 bool eol_dos
2295 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2296 int byte_after_cr = -1;
2297 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2299 if (cmp_status->state != COMPOSING_NO)
2301 int i;
2303 if (charbuf_end - charbuf < cmp_status->length)
2304 emacs_abort ();
2305 for (i = 0; i < cmp_status->length; i++)
2306 *charbuf++ = cmp_status->carryover[i];
2307 coding->annotated = 1;
2310 while (1)
2312 int c, id IF_LINT (= 0);
2314 src_base = src;
2315 consumed_chars_base = consumed_chars;
2317 if (charbuf >= charbuf_end)
2319 if (byte_after_cr >= 0)
2320 src_base--;
2321 break;
2324 if (byte_after_cr >= 0)
2325 c = byte_after_cr, byte_after_cr = -1;
2326 else
2327 ONE_MORE_BYTE (c);
2329 if (c < 0 || c == 0x80)
2331 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2332 if (c < 0)
2334 *charbuf++ = -c;
2335 char_offset++;
2337 else
2338 DECODE_EMACS_MULE_COMPOSITION_START ();
2339 continue;
2342 if (c < 0x80)
2344 if (eol_dos && c == '\r')
2345 ONE_MORE_BYTE (byte_after_cr);
2346 id = charset_ascii;
2347 if (cmp_status->state != COMPOSING_NO)
2349 if (cmp_status->old_form)
2350 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2351 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2352 cmp_status->ncomps--;
2355 else
2357 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2358 /* emacs_mule_char can load a charset map from a file, which
2359 allocates a large structure and might cause buffer text
2360 to be relocated as result. Thus, we need to remember the
2361 original pointer to buffer text, and fix up all related
2362 pointers after the call. */
2363 const unsigned char *orig = coding->source;
2364 ptrdiff_t offset;
2366 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2367 cmp_status);
2368 offset = coding->source - orig;
2369 if (offset)
2371 src += offset;
2372 src_base += offset;
2373 src_end += offset;
2375 if (c < 0)
2377 if (c == -1)
2378 goto invalid_code;
2379 if (c == -2)
2380 break;
2382 src = src_base + nbytes;
2383 consumed_chars = consumed_chars_base + nchars;
2384 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2385 cmp_status->ncomps -= nchars;
2388 /* Now if C >= 0, we found a normally encoded character, if C <
2389 0, we found an old-style composition component character or
2390 rule. */
2392 if (cmp_status->state == COMPOSING_NO)
2394 if (last_id != id)
2396 if (last_id != charset_ascii)
2397 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2398 last_id);
2399 last_id = id;
2400 last_offset = char_offset;
2402 *charbuf++ = c;
2403 char_offset++;
2405 else if (cmp_status->state == COMPOSING_CHAR)
2407 if (cmp_status->old_form)
2409 if (c >= 0)
2411 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2412 *charbuf++ = c;
2413 char_offset++;
2415 else
2417 *charbuf++ = -c;
2418 cmp_status->nchars++;
2419 cmp_status->length++;
2420 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2421 EMACS_MULE_COMPOSITION_END ();
2422 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2423 cmp_status->state = COMPOSING_RULE;
2426 else
2428 *charbuf++ = c;
2429 cmp_status->length++;
2430 cmp_status->nchars--;
2431 if (cmp_status->nchars == 0)
2432 EMACS_MULE_COMPOSITION_END ();
2435 else if (cmp_status->state == COMPOSING_RULE)
2437 int rule;
2439 if (c >= 0)
2441 EMACS_MULE_COMPOSITION_END ();
2442 *charbuf++ = c;
2443 char_offset++;
2445 else
2447 c = -c;
2448 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2449 if (rule < 0)
2450 goto invalid_code;
2451 *charbuf++ = -2;
2452 *charbuf++ = rule;
2453 cmp_status->length += 2;
2454 cmp_status->state = COMPOSING_CHAR;
2457 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2459 *charbuf++ = c;
2460 cmp_status->length++;
2461 if (cmp_status->ncomps == 0)
2462 cmp_status->state = COMPOSING_CHAR;
2463 else if (cmp_status->ncomps > 0)
2465 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2466 cmp_status->state = COMPOSING_COMPONENT_RULE;
2468 else
2469 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2471 else /* COMPOSING_COMPONENT_RULE */
2473 int rule;
2475 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2476 if (rule < 0)
2477 goto invalid_code;
2478 *charbuf++ = -2;
2479 *charbuf++ = rule;
2480 cmp_status->length += 2;
2481 cmp_status->ncomps--;
2482 if (cmp_status->ncomps > 0)
2483 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2484 else
2485 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2487 continue;
2489 invalid_code:
2490 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2491 src = src_base;
2492 consumed_chars = consumed_chars_base;
2493 ONE_MORE_BYTE (c);
2494 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2495 char_offset++;
2496 coding->errors++;
2499 no_more_source:
2500 if (cmp_status->state != COMPOSING_NO)
2502 if (coding->mode & CODING_MODE_LAST_BLOCK)
2503 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2504 else
2506 int i;
2508 charbuf -= cmp_status->length;
2509 for (i = 0; i < cmp_status->length; i++)
2510 cmp_status->carryover[i] = charbuf[i];
2513 if (last_id != charset_ascii)
2514 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2515 coding->consumed_char += consumed_chars_base;
2516 coding->consumed = src_base - coding->source;
2517 coding->charbuf_used = charbuf - coding->charbuf;
2521 #define EMACS_MULE_LEADING_CODES(id, codes) \
2522 do { \
2523 if (id < 0xA0) \
2524 codes[0] = id, codes[1] = 0; \
2525 else if (id < 0xE0) \
2526 codes[0] = 0x9A, codes[1] = id; \
2527 else if (id < 0xF0) \
2528 codes[0] = 0x9B, codes[1] = id; \
2529 else if (id < 0xF5) \
2530 codes[0] = 0x9C, codes[1] = id; \
2531 else \
2532 codes[0] = 0x9D, codes[1] = id; \
2533 } while (0);
2536 static bool
2537 encode_coding_emacs_mule (struct coding_system *coding)
2539 bool multibytep = coding->dst_multibyte;
2540 int *charbuf = coding->charbuf;
2541 int *charbuf_end = charbuf + coding->charbuf_used;
2542 unsigned char *dst = coding->destination + coding->produced;
2543 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2544 int safe_room = 8;
2545 ptrdiff_t produced_chars = 0;
2546 Lisp_Object attrs, charset_list;
2547 int c;
2548 int preferred_charset_id = -1;
2550 CODING_GET_INFO (coding, attrs, charset_list);
2551 if (! EQ (charset_list, Vemacs_mule_charset_list))
2553 charset_list = Vemacs_mule_charset_list;
2554 ASET (attrs, coding_attr_charset_list, charset_list);
2557 while (charbuf < charbuf_end)
2559 ASSURE_DESTINATION (safe_room);
2560 c = *charbuf++;
2562 if (c < 0)
2564 /* Handle an annotation. */
2565 switch (*charbuf)
2567 case CODING_ANNOTATE_COMPOSITION_MASK:
2568 /* Not yet implemented. */
2569 break;
2570 case CODING_ANNOTATE_CHARSET_MASK:
2571 preferred_charset_id = charbuf[3];
2572 if (preferred_charset_id >= 0
2573 && NILP (Fmemq (make_number (preferred_charset_id),
2574 charset_list)))
2575 preferred_charset_id = -1;
2576 break;
2577 default:
2578 emacs_abort ();
2580 charbuf += -c - 1;
2581 continue;
2584 if (ASCII_CHAR_P (c))
2585 EMIT_ONE_ASCII_BYTE (c);
2586 else if (CHAR_BYTE8_P (c))
2588 c = CHAR_TO_BYTE8 (c);
2589 EMIT_ONE_BYTE (c);
2591 else
2593 struct charset *charset;
2594 unsigned code;
2595 int dimension;
2596 int emacs_mule_id;
2597 unsigned char leading_codes[2];
2599 if (preferred_charset_id >= 0)
2601 bool result;
2603 charset = CHARSET_FROM_ID (preferred_charset_id);
2604 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2605 if (result)
2606 code = ENCODE_CHAR (charset, c);
2607 else
2608 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2609 &code, charset);
2611 else
2612 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2613 &code, charset);
2614 if (! charset)
2616 c = coding->default_char;
2617 if (ASCII_CHAR_P (c))
2619 EMIT_ONE_ASCII_BYTE (c);
2620 continue;
2622 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2623 &code, charset);
2625 dimension = CHARSET_DIMENSION (charset);
2626 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2627 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2628 EMIT_ONE_BYTE (leading_codes[0]);
2629 if (leading_codes[1])
2630 EMIT_ONE_BYTE (leading_codes[1]);
2631 if (dimension == 1)
2632 EMIT_ONE_BYTE (code | 0x80);
2633 else
2635 code |= 0x8080;
2636 EMIT_ONE_BYTE (code >> 8);
2637 EMIT_ONE_BYTE (code & 0xFF);
2641 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2642 coding->produced_char += produced_chars;
2643 coding->produced = dst - coding->destination;
2644 return 0;
2648 /*** 7. ISO2022 handlers ***/
2650 /* The following note describes the coding system ISO2022 briefly.
2651 Since the intention of this note is to help understand the
2652 functions in this file, some parts are NOT ACCURATE or are OVERLY
2653 SIMPLIFIED. For thorough understanding, please refer to the
2654 original document of ISO2022. This is equivalent to the standard
2655 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2657 ISO2022 provides many mechanisms to encode several character sets
2658 in 7-bit and 8-bit environments. For 7-bit environments, all text
2659 is encoded using bytes less than 128. This may make the encoded
2660 text a little bit longer, but the text passes more easily through
2661 several types of gateway, some of which strip off the MSB (Most
2662 Significant Bit).
2664 There are two kinds of character sets: control character sets and
2665 graphic character sets. The former contain control characters such
2666 as `newline' and `escape' to provide control functions (control
2667 functions are also provided by escape sequences). The latter
2668 contain graphic characters such as 'A' and '-'. Emacs recognizes
2669 two control character sets and many graphic character sets.
2671 Graphic character sets are classified into one of the following
2672 four classes, according to the number of bytes (DIMENSION) and
2673 number of characters in one dimension (CHARS) of the set:
2674 - DIMENSION1_CHARS94
2675 - DIMENSION1_CHARS96
2676 - DIMENSION2_CHARS94
2677 - DIMENSION2_CHARS96
2679 In addition, each character set is assigned an identification tag,
2680 unique for each set, called the "final character" (denoted as <F>
2681 hereafter). The <F> of each character set is decided by ECMA(*)
2682 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2683 (0x30..0x3F are for private use only).
2685 Note (*): ECMA = European Computer Manufacturers Association
2687 Here are examples of graphic character sets [NAME(<F>)]:
2688 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2689 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2690 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2691 o DIMENSION2_CHARS96 -- none for the moment
2693 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2694 C0 [0x00..0x1F] -- control character plane 0
2695 GL [0x20..0x7F] -- graphic character plane 0
2696 C1 [0x80..0x9F] -- control character plane 1
2697 GR [0xA0..0xFF] -- graphic character plane 1
2699 A control character set is directly designated and invoked to C0 or
2700 C1 by an escape sequence. The most common case is that:
2701 - ISO646's control character set is designated/invoked to C0, and
2702 - ISO6429's control character set is designated/invoked to C1,
2703 and usually these designations/invocations are omitted in encoded
2704 text. In a 7-bit environment, only C0 can be used, and a control
2705 character for C1 is encoded by an appropriate escape sequence to
2706 fit into the environment. All control characters for C1 are
2707 defined to have corresponding escape sequences.
2709 A graphic character set is at first designated to one of four
2710 graphic registers (G0 through G3), then these graphic registers are
2711 invoked to GL or GR. These designations and invocations can be
2712 done independently. The most common case is that G0 is invoked to
2713 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2714 these invocations and designations are omitted in encoded text.
2715 In a 7-bit environment, only GL can be used.
2717 When a graphic character set of CHARS94 is invoked to GL, codes
2718 0x20 and 0x7F of the GL area work as control characters SPACE and
2719 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2720 be used.
2722 There are two ways of invocation: locking-shift and single-shift.
2723 With locking-shift, the invocation lasts until the next different
2724 invocation, whereas with single-shift, the invocation affects the
2725 following character only and doesn't affect the locking-shift
2726 state. Invocations are done by the following control characters or
2727 escape sequences:
2729 ----------------------------------------------------------------------
2730 abbrev function cntrl escape seq description
2731 ----------------------------------------------------------------------
2732 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2733 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2734 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2735 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2736 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2737 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2738 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2739 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2740 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2741 ----------------------------------------------------------------------
2742 (*) These are not used by any known coding system.
2744 Control characters for these functions are defined by macros
2745 ISO_CODE_XXX in `coding.h'.
2747 Designations are done by the following escape sequences:
2748 ----------------------------------------------------------------------
2749 escape sequence description
2750 ----------------------------------------------------------------------
2751 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2752 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2753 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2754 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2755 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2756 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2757 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2758 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2759 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2760 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2761 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2762 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2763 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2764 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2765 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2766 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2767 ----------------------------------------------------------------------
2769 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2770 of dimension 1, chars 94, and final character <F>, etc...
2772 Note (*): Although these designations are not allowed in ISO2022,
2773 Emacs accepts them on decoding, and produces them on encoding
2774 CHARS96 character sets in a coding system which is characterized as
2775 7-bit environment, non-locking-shift, and non-single-shift.
2777 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2778 '(' must be omitted. We refer to this as "short-form" hereafter.
2780 Now you may notice that there are a lot of ways of encoding the
2781 same multilingual text in ISO2022. Actually, there exist many
2782 coding systems such as Compound Text (used in X11's inter client
2783 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2784 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2785 localized platforms), and all of these are variants of ISO2022.
2787 In addition to the above, Emacs handles two more kinds of escape
2788 sequences: ISO6429's direction specification and Emacs' private
2789 sequence for specifying character composition.
2791 ISO6429's direction specification takes the following form:
2792 o CSI ']' -- end of the current direction
2793 o CSI '0' ']' -- end of the current direction
2794 o CSI '1' ']' -- start of left-to-right text
2795 o CSI '2' ']' -- start of right-to-left text
2796 The control character CSI (0x9B: control sequence introducer) is
2797 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2799 Character composition specification takes the following form:
2800 o ESC '0' -- start relative composition
2801 o ESC '1' -- end composition
2802 o ESC '2' -- start rule-base composition (*)
2803 o ESC '3' -- start relative composition with alternate chars (**)
2804 o ESC '4' -- start rule-base composition with alternate chars (**)
2805 Since these are not standard escape sequences of any ISO standard,
2806 the use of them with these meanings is restricted to Emacs only.
2808 (*) This form is used only in Emacs 20.7 and older versions,
2809 but newer versions can safely decode it.
2810 (**) This form is used only in Emacs 21.1 and newer versions,
2811 and older versions can't decode it.
2813 Here's a list of example usages of these composition escape
2814 sequences (categorized by `enum composition_method').
2816 COMPOSITION_RELATIVE:
2817 ESC 0 CHAR [ CHAR ] ESC 1
2818 COMPOSITION_WITH_RULE:
2819 ESC 2 CHAR [ RULE CHAR ] ESC 1
2820 COMPOSITION_WITH_ALTCHARS:
2821 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2822 COMPOSITION_WITH_RULE_ALTCHARS:
2823 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2825 static enum iso_code_class_type iso_code_class[256];
2827 #define SAFE_CHARSET_P(coding, id) \
2828 ((id) <= (coding)->max_charset_id \
2829 && (coding)->safe_charsets[id] != 255)
2831 static void
2832 setup_iso_safe_charsets (Lisp_Object attrs)
2834 Lisp_Object charset_list, safe_charsets;
2835 Lisp_Object request;
2836 Lisp_Object reg_usage;
2837 Lisp_Object tail;
2838 EMACS_INT reg94, reg96;
2839 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2840 int max_charset_id;
2842 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2843 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2844 && ! EQ (charset_list, Viso_2022_charset_list))
2846 charset_list = Viso_2022_charset_list;
2847 ASET (attrs, coding_attr_charset_list, charset_list);
2848 ASET (attrs, coding_attr_safe_charsets, Qnil);
2851 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2852 return;
2854 max_charset_id = 0;
2855 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2857 int id = XINT (XCAR (tail));
2858 if (max_charset_id < id)
2859 max_charset_id = id;
2862 safe_charsets = make_uninit_string (max_charset_id + 1);
2863 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2864 request = AREF (attrs, coding_attr_iso_request);
2865 reg_usage = AREF (attrs, coding_attr_iso_usage);
2866 reg94 = XINT (XCAR (reg_usage));
2867 reg96 = XINT (XCDR (reg_usage));
2869 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2871 Lisp_Object id;
2872 Lisp_Object reg;
2873 struct charset *charset;
2875 id = XCAR (tail);
2876 charset = CHARSET_FROM_ID (XINT (id));
2877 reg = Fcdr (Fassq (id, request));
2878 if (! NILP (reg))
2879 SSET (safe_charsets, XINT (id), XINT (reg));
2880 else if (charset->iso_chars_96)
2882 if (reg96 < 4)
2883 SSET (safe_charsets, XINT (id), reg96);
2885 else
2887 if (reg94 < 4)
2888 SSET (safe_charsets, XINT (id), reg94);
2891 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2895 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2896 Return true if a text is encoded in one of ISO-2022 based coding
2897 systems. */
2899 static bool
2900 detect_coding_iso_2022 (struct coding_system *coding,
2901 struct coding_detection_info *detect_info)
2903 const unsigned char *src = coding->source, *src_base = src;
2904 const unsigned char *src_end = coding->source + coding->src_bytes;
2905 bool multibytep = coding->src_multibyte;
2906 bool single_shifting = 0;
2907 int id;
2908 int c, c1;
2909 ptrdiff_t consumed_chars = 0;
2910 int i;
2911 int rejected = 0;
2912 int found = 0;
2913 int composition_count = -1;
2915 detect_info->checked |= CATEGORY_MASK_ISO;
2917 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2919 struct coding_system *this = &(coding_categories[i]);
2920 Lisp_Object attrs, val;
2922 if (this->id < 0)
2923 continue;
2924 attrs = CODING_ID_ATTRS (this->id);
2925 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2926 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2927 setup_iso_safe_charsets (attrs);
2928 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2929 this->max_charset_id = SCHARS (val) - 1;
2930 this->safe_charsets = SDATA (val);
2933 /* A coding system of this category is always ASCII compatible. */
2934 src += coding->head_ascii;
2936 while (rejected != CATEGORY_MASK_ISO)
2938 src_base = src;
2939 ONE_MORE_BYTE (c);
2940 switch (c)
2942 case ISO_CODE_ESC:
2943 if (inhibit_iso_escape_detection)
2944 break;
2945 single_shifting = 0;
2946 ONE_MORE_BYTE (c);
2947 if (c == 'N' || c == 'O')
2949 /* ESC <Fe> for SS2 or SS3. */
2950 single_shifting = 1;
2951 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2953 else if (c == '1')
2955 /* End of composition. */
2956 if (composition_count < 0
2957 || composition_count > MAX_COMPOSITION_COMPONENTS)
2958 /* Invalid */
2959 break;
2960 composition_count = -1;
2961 found |= CATEGORY_MASK_ISO;
2963 else if (c >= '0' && c <= '4')
2965 /* ESC <Fp> for start/end composition. */
2966 composition_count = 0;
2968 else
2970 if (c >= '(' && c <= '/')
2972 /* Designation sequence for a charset of dimension 1. */
2973 ONE_MORE_BYTE (c1);
2974 if (c1 < ' ' || c1 >= 0x80
2975 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2976 /* Invalid designation sequence. Just ignore. */
2977 break;
2979 else if (c == '$')
2981 /* Designation sequence for a charset of dimension 2. */
2982 ONE_MORE_BYTE (c);
2983 if (c >= '@' && c <= 'B')
2984 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2985 id = iso_charset_table[1][0][c];
2986 else if (c >= '(' && c <= '/')
2988 ONE_MORE_BYTE (c1);
2989 if (c1 < ' ' || c1 >= 0x80
2990 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2991 /* Invalid designation sequence. Just ignore. */
2992 break;
2994 else
2995 /* Invalid designation sequence. Just ignore it. */
2996 break;
2998 else
3000 /* Invalid escape sequence. Just ignore it. */
3001 break;
3004 /* We found a valid designation sequence for CHARSET. */
3005 rejected |= CATEGORY_MASK_ISO_8BIT;
3006 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3007 id))
3008 found |= CATEGORY_MASK_ISO_7;
3009 else
3010 rejected |= CATEGORY_MASK_ISO_7;
3011 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3012 id))
3013 found |= CATEGORY_MASK_ISO_7_TIGHT;
3014 else
3015 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3016 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3017 id))
3018 found |= CATEGORY_MASK_ISO_7_ELSE;
3019 else
3020 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3021 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3022 id))
3023 found |= CATEGORY_MASK_ISO_8_ELSE;
3024 else
3025 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3027 break;
3029 case ISO_CODE_SO:
3030 case ISO_CODE_SI:
3031 /* Locking shift out/in. */
3032 if (inhibit_iso_escape_detection)
3033 break;
3034 single_shifting = 0;
3035 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3036 break;
3038 case ISO_CODE_CSI:
3039 /* Control sequence introducer. */
3040 single_shifting = 0;
3041 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3042 found |= CATEGORY_MASK_ISO_8_ELSE;
3043 goto check_extra_latin;
3045 case ISO_CODE_SS2:
3046 case ISO_CODE_SS3:
3047 /* Single shift. */
3048 if (inhibit_iso_escape_detection)
3049 break;
3050 single_shifting = 0;
3051 rejected |= CATEGORY_MASK_ISO_7BIT;
3052 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3053 & CODING_ISO_FLAG_SINGLE_SHIFT)
3055 found |= CATEGORY_MASK_ISO_8_1;
3056 single_shifting = 1;
3058 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3059 & CODING_ISO_FLAG_SINGLE_SHIFT)
3061 found |= CATEGORY_MASK_ISO_8_2;
3062 single_shifting = 1;
3064 if (single_shifting)
3065 break;
3066 goto check_extra_latin;
3068 default:
3069 if (c < 0)
3070 continue;
3071 if (c < 0x80)
3073 if (composition_count >= 0)
3074 composition_count++;
3075 single_shifting = 0;
3076 break;
3078 if (c >= 0xA0)
3080 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3081 found |= CATEGORY_MASK_ISO_8_1;
3082 /* Check the length of succeeding codes of the range
3083 0xA0..0FF. If the byte length is even, we include
3084 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3085 only when we are not single shifting. */
3086 if (! single_shifting
3087 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3089 int len = 1;
3090 while (src < src_end)
3092 src_base = src;
3093 ONE_MORE_BYTE (c);
3094 if (c < 0xA0)
3096 src = src_base;
3097 break;
3099 len++;
3102 if (len & 1 && src < src_end)
3104 rejected |= CATEGORY_MASK_ISO_8_2;
3105 if (composition_count >= 0)
3106 composition_count += len;
3108 else
3110 found |= CATEGORY_MASK_ISO_8_2;
3111 if (composition_count >= 0)
3112 composition_count += len / 2;
3115 break;
3117 check_extra_latin:
3118 if (! VECTORP (Vlatin_extra_code_table)
3119 || NILP (AREF (Vlatin_extra_code_table, c)))
3121 rejected = CATEGORY_MASK_ISO;
3122 break;
3124 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3125 & CODING_ISO_FLAG_LATIN_EXTRA)
3126 found |= CATEGORY_MASK_ISO_8_1;
3127 else
3128 rejected |= CATEGORY_MASK_ISO_8_1;
3129 rejected |= CATEGORY_MASK_ISO_8_2;
3130 break;
3133 detect_info->rejected |= CATEGORY_MASK_ISO;
3134 return 0;
3136 no_more_source:
3137 detect_info->rejected |= rejected;
3138 detect_info->found |= (found & ~rejected);
3139 return 1;
3143 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3144 escape sequence should be kept. */
3145 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3146 do { \
3147 int id, prev; \
3149 if (final < '0' || final >= 128 \
3150 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3151 || !SAFE_CHARSET_P (coding, id)) \
3153 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3154 chars_96 = -1; \
3155 break; \
3157 prev = CODING_ISO_DESIGNATION (coding, reg); \
3158 if (id == charset_jisx0201_roman) \
3160 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3161 id = charset_ascii; \
3163 else if (id == charset_jisx0208_1978) \
3165 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3166 id = charset_jisx0208; \
3168 CODING_ISO_DESIGNATION (coding, reg) = id; \
3169 /* If there was an invalid designation to REG previously, and this \
3170 designation is ASCII to REG, we should keep this designation \
3171 sequence. */ \
3172 if (prev == -2 && id == charset_ascii) \
3173 chars_96 = -1; \
3174 } while (0)
3177 /* Handle these composition sequence (ALT: alternate char):
3179 (1) relative composition: ESC 0 CHAR ... ESC 1
3180 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3181 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3182 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3184 When the start sequence (ESC 0/2/3/4) is found, this annotation
3185 header is produced.
3187 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3189 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3190 produced until the end sequence (ESC 1) is found:
3192 (1) CHAR ... CHAR
3193 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3194 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3195 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3197 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3198 annotation header is updated as below:
3200 (1) LENGTH: unchanged, NCHARS: number of CHARs
3201 (2) LENGTH: unchanged, NCHARS: number of CHARs
3202 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3203 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3205 If an error is found while composing, the annotation header is
3206 changed to:
3208 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3210 and the sequence [ -2 DECODED-RULE ] is changed to the original
3211 byte sequence as below:
3212 o the original byte sequence is B: [ B -1 ]
3213 o the original byte sequence is B1 B2: [ B1 B2 ]
3214 and the sequence [ -1 -1 ] is changed to the original byte
3215 sequence:
3216 [ ESC '0' ]
3219 /* Decode a composition rule C1 and maybe one more byte from the
3220 source, and set RULE to the encoded composition rule. If the rule
3221 is invalid, goto invalid_code. */
3223 #define DECODE_COMPOSITION_RULE(rule) \
3224 do { \
3225 rule = c1 - 32; \
3226 if (rule < 0) \
3227 goto invalid_code; \
3228 if (rule < 81) /* old format (before ver.21) */ \
3230 int gref = (rule) / 9; \
3231 int nref = (rule) % 9; \
3232 if (gref == 4) gref = 10; \
3233 if (nref == 4) nref = 10; \
3234 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3236 else /* new format (after ver.21) */ \
3238 int b; \
3240 ONE_MORE_BYTE (b); \
3241 if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32)) \
3242 goto invalid_code; \
3243 rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32); \
3244 rule += 0x100; /* Distinguish it from the old format. */ \
3246 } while (0)
3248 #define ENCODE_COMPOSITION_RULE(rule) \
3249 do { \
3250 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3252 if (rule < 0x100) /* old format */ \
3254 if (gref == 10) gref = 4; \
3255 if (nref == 10) nref = 4; \
3256 charbuf[idx] = 32 + gref * 9 + nref; \
3257 charbuf[idx + 1] = -1; \
3258 new_chars++; \
3260 else /* new format */ \
3262 charbuf[idx] = 32 + 81 + gref; \
3263 charbuf[idx + 1] = 32 + nref; \
3264 new_chars += 2; \
3266 } while (0)
3268 /* Finish the current composition as invalid. */
3270 static int
3271 finish_composition (int *charbuf, struct composition_status *cmp_status)
3273 int idx = - cmp_status->length;
3274 int new_chars;
3276 /* Recover the original ESC sequence */
3277 charbuf[idx++] = ISO_CODE_ESC;
3278 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3279 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3280 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3281 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3282 : '4');
3283 charbuf[idx++] = -2;
3284 charbuf[idx++] = 0;
3285 charbuf[idx++] = -1;
3286 new_chars = cmp_status->nchars;
3287 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3288 for (; idx < 0; idx++)
3290 int elt = charbuf[idx];
3292 if (elt == -2)
3294 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3295 idx++;
3297 else if (elt == -1)
3299 charbuf[idx++] = ISO_CODE_ESC;
3300 charbuf[idx] = '0';
3301 new_chars += 2;
3304 cmp_status->state = COMPOSING_NO;
3305 return new_chars;
3308 /* If characters are under composition, finish the composition. */
3309 #define MAYBE_FINISH_COMPOSITION() \
3310 do { \
3311 if (cmp_status->state != COMPOSING_NO) \
3312 char_offset += finish_composition (charbuf, cmp_status); \
3313 } while (0)
3315 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3317 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3318 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3319 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3320 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3322 Produce this annotation sequence now:
3324 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3327 #define DECODE_COMPOSITION_START(c1) \
3328 do { \
3329 if (c1 == '0' \
3330 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3331 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3332 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3333 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3335 *charbuf++ = -1; \
3336 *charbuf++= -1; \
3337 cmp_status->state = COMPOSING_CHAR; \
3338 cmp_status->length += 2; \
3340 else \
3342 MAYBE_FINISH_COMPOSITION (); \
3343 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3344 : c1 == '2' ? COMPOSITION_WITH_RULE \
3345 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3346 : COMPOSITION_WITH_RULE_ALTCHARS); \
3347 cmp_status->state \
3348 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3349 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3350 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3351 cmp_status->nchars = cmp_status->ncomps = 0; \
3352 coding->annotated = 1; \
3354 } while (0)
3357 /* Handle composition end sequence ESC 1. */
3359 #define DECODE_COMPOSITION_END() \
3360 do { \
3361 if (cmp_status->nchars == 0 \
3362 || ((cmp_status->state == COMPOSING_CHAR) \
3363 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3365 MAYBE_FINISH_COMPOSITION (); \
3366 goto invalid_code; \
3368 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3369 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3370 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3371 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3372 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3373 char_offset += cmp_status->nchars; \
3374 cmp_status->state = COMPOSING_NO; \
3375 } while (0)
3377 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3379 #define STORE_COMPOSITION_RULE(rule) \
3380 do { \
3381 *charbuf++ = -2; \
3382 *charbuf++ = rule; \
3383 cmp_status->length += 2; \
3384 cmp_status->state--; \
3385 } while (0)
3387 /* Store a composed char or a component char C in charbuf, and update
3388 cmp_status. */
3390 #define STORE_COMPOSITION_CHAR(c) \
3391 do { \
3392 *charbuf++ = (c); \
3393 cmp_status->length++; \
3394 if (cmp_status->state == COMPOSING_CHAR) \
3395 cmp_status->nchars++; \
3396 else \
3397 cmp_status->ncomps++; \
3398 if (cmp_status->method == COMPOSITION_WITH_RULE \
3399 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3400 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3401 cmp_status->state++; \
3402 } while (0)
3405 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3407 static void
3408 decode_coding_iso_2022 (struct coding_system *coding)
3410 const unsigned char *src = coding->source + coding->consumed;
3411 const unsigned char *src_end = coding->source + coding->src_bytes;
3412 const unsigned char *src_base;
3413 int *charbuf = coding->charbuf + coding->charbuf_used;
3414 /* We may produce two annotations (charset and composition) in one
3415 loop and one more charset annotation at the end. */
3416 int *charbuf_end
3417 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3418 ptrdiff_t consumed_chars = 0, consumed_chars_base;
3419 bool multibytep = coding->src_multibyte;
3420 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3421 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3422 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3423 int charset_id_2, charset_id_3;
3424 struct charset *charset;
3425 int c;
3426 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3427 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3428 ptrdiff_t char_offset = coding->produced_char;
3429 ptrdiff_t last_offset = char_offset;
3430 int last_id = charset_ascii;
3431 bool eol_dos
3432 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3433 int byte_after_cr = -1;
3434 int i;
3436 setup_iso_safe_charsets (attrs);
3437 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3439 if (cmp_status->state != COMPOSING_NO)
3441 if (charbuf_end - charbuf < cmp_status->length)
3442 emacs_abort ();
3443 for (i = 0; i < cmp_status->length; i++)
3444 *charbuf++ = cmp_status->carryover[i];
3445 coding->annotated = 1;
3448 while (1)
3450 int c1, c2, c3;
3452 src_base = src;
3453 consumed_chars_base = consumed_chars;
3455 if (charbuf >= charbuf_end)
3457 if (byte_after_cr >= 0)
3458 src_base--;
3459 break;
3462 if (byte_after_cr >= 0)
3463 c1 = byte_after_cr, byte_after_cr = -1;
3464 else
3465 ONE_MORE_BYTE (c1);
3466 if (c1 < 0)
3467 goto invalid_code;
3469 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3471 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3472 char_offset++;
3473 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3474 continue;
3477 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3479 if (c1 == ISO_CODE_ESC)
3481 if (src + 1 >= src_end)
3482 goto no_more_source;
3483 *charbuf++ = ISO_CODE_ESC;
3484 char_offset++;
3485 if (src[0] == '%' && src[1] == '@')
3487 src += 2;
3488 consumed_chars += 2;
3489 char_offset += 2;
3490 /* We are sure charbuf can contain two more chars. */
3491 *charbuf++ = '%';
3492 *charbuf++ = '@';
3493 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3496 else
3498 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3499 char_offset++;
3501 continue;
3504 if ((cmp_status->state == COMPOSING_RULE
3505 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3506 && c1 != ISO_CODE_ESC)
3508 int rule;
3510 DECODE_COMPOSITION_RULE (rule);
3511 STORE_COMPOSITION_RULE (rule);
3512 continue;
3515 /* We produce at most one character. */
3516 switch (iso_code_class [c1])
3518 case ISO_0x20_or_0x7F:
3519 if (charset_id_0 < 0
3520 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3521 /* This is SPACE or DEL. */
3522 charset = CHARSET_FROM_ID (charset_ascii);
3523 else
3524 charset = CHARSET_FROM_ID (charset_id_0);
3525 break;
3527 case ISO_graphic_plane_0:
3528 if (charset_id_0 < 0)
3529 charset = CHARSET_FROM_ID (charset_ascii);
3530 else
3531 charset = CHARSET_FROM_ID (charset_id_0);
3532 break;
3534 case ISO_0xA0_or_0xFF:
3535 if (charset_id_1 < 0
3536 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3537 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3538 goto invalid_code;
3539 /* This is a graphic character, we fall down ... */
3541 case ISO_graphic_plane_1:
3542 if (charset_id_1 < 0)
3543 goto invalid_code;
3544 charset = CHARSET_FROM_ID (charset_id_1);
3545 break;
3547 case ISO_control_0:
3548 if (eol_dos && c1 == '\r')
3549 ONE_MORE_BYTE (byte_after_cr);
3550 MAYBE_FINISH_COMPOSITION ();
3551 charset = CHARSET_FROM_ID (charset_ascii);
3552 break;
3554 case ISO_control_1:
3555 goto invalid_code;
3557 case ISO_shift_out:
3558 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3559 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3560 goto invalid_code;
3561 CODING_ISO_INVOCATION (coding, 0) = 1;
3562 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3563 continue;
3565 case ISO_shift_in:
3566 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3567 goto invalid_code;
3568 CODING_ISO_INVOCATION (coding, 0) = 0;
3569 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3570 continue;
3572 case ISO_single_shift_2_7:
3573 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3574 goto invalid_code;
3575 case ISO_single_shift_2:
3576 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3577 goto invalid_code;
3578 /* SS2 is handled as an escape sequence of ESC 'N' */
3579 c1 = 'N';
3580 goto label_escape_sequence;
3582 case ISO_single_shift_3:
3583 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3584 goto invalid_code;
3585 /* SS2 is handled as an escape sequence of ESC 'O' */
3586 c1 = 'O';
3587 goto label_escape_sequence;
3589 case ISO_control_sequence_introducer:
3590 /* CSI is handled as an escape sequence of ESC '[' ... */
3591 c1 = '[';
3592 goto label_escape_sequence;
3594 case ISO_escape:
3595 ONE_MORE_BYTE (c1);
3596 label_escape_sequence:
3597 /* Escape sequences handled here are invocation,
3598 designation, direction specification, and character
3599 composition specification. */
3600 switch (c1)
3602 case '&': /* revision of following character set */
3603 ONE_MORE_BYTE (c1);
3604 if (!(c1 >= '@' && c1 <= '~'))
3605 goto invalid_code;
3606 ONE_MORE_BYTE (c1);
3607 if (c1 != ISO_CODE_ESC)
3608 goto invalid_code;
3609 ONE_MORE_BYTE (c1);
3610 goto label_escape_sequence;
3612 case '$': /* designation of 2-byte character set */
3613 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3614 goto invalid_code;
3616 int reg, chars96;
3618 ONE_MORE_BYTE (c1);
3619 if (c1 >= '@' && c1 <= 'B')
3620 { /* designation of JISX0208.1978, GB2312.1980,
3621 or JISX0208.1980 */
3622 reg = 0, chars96 = 0;
3624 else if (c1 >= 0x28 && c1 <= 0x2B)
3625 { /* designation of DIMENSION2_CHARS94 character set */
3626 reg = c1 - 0x28, chars96 = 0;
3627 ONE_MORE_BYTE (c1);
3629 else if (c1 >= 0x2C && c1 <= 0x2F)
3630 { /* designation of DIMENSION2_CHARS96 character set */
3631 reg = c1 - 0x2C, chars96 = 1;
3632 ONE_MORE_BYTE (c1);
3634 else
3635 goto invalid_code;
3636 DECODE_DESIGNATION (reg, 2, chars96, c1);
3637 /* We must update these variables now. */
3638 if (reg == 0)
3639 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3640 else if (reg == 1)
3641 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3642 if (chars96 < 0)
3643 goto invalid_code;
3645 continue;
3647 case 'n': /* invocation of locking-shift-2 */
3648 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3649 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3650 goto invalid_code;
3651 CODING_ISO_INVOCATION (coding, 0) = 2;
3652 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3653 continue;
3655 case 'o': /* invocation of locking-shift-3 */
3656 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3657 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3658 goto invalid_code;
3659 CODING_ISO_INVOCATION (coding, 0) = 3;
3660 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3661 continue;
3663 case 'N': /* invocation of single-shift-2 */
3664 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3665 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3666 goto invalid_code;
3667 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3668 if (charset_id_2 < 0)
3669 charset = CHARSET_FROM_ID (charset_ascii);
3670 else
3671 charset = CHARSET_FROM_ID (charset_id_2);
3672 ONE_MORE_BYTE (c1);
3673 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3674 goto invalid_code;
3675 break;
3677 case 'O': /* invocation of single-shift-3 */
3678 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3679 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3680 goto invalid_code;
3681 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3682 if (charset_id_3 < 0)
3683 charset = CHARSET_FROM_ID (charset_ascii);
3684 else
3685 charset = CHARSET_FROM_ID (charset_id_3);
3686 ONE_MORE_BYTE (c1);
3687 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3688 goto invalid_code;
3689 break;
3691 case '0': case '2': case '3': case '4': /* start composition */
3692 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3693 goto invalid_code;
3694 if (last_id != charset_ascii)
3696 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3697 last_id = charset_ascii;
3698 last_offset = char_offset;
3700 DECODE_COMPOSITION_START (c1);
3701 continue;
3703 case '1': /* end composition */
3704 if (cmp_status->state == COMPOSING_NO)
3705 goto invalid_code;
3706 DECODE_COMPOSITION_END ();
3707 continue;
3709 case '[': /* specification of direction */
3710 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3711 goto invalid_code;
3712 /* For the moment, nested direction is not supported.
3713 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3714 left-to-right, and nonzero means right-to-left. */
3715 ONE_MORE_BYTE (c1);
3716 switch (c1)
3718 case ']': /* end of the current direction */
3719 coding->mode &= ~CODING_MODE_DIRECTION;
3721 case '0': /* end of the current direction */
3722 case '1': /* start of left-to-right direction */
3723 ONE_MORE_BYTE (c1);
3724 if (c1 == ']')
3725 coding->mode &= ~CODING_MODE_DIRECTION;
3726 else
3727 goto invalid_code;
3728 break;
3730 case '2': /* start of right-to-left direction */
3731 ONE_MORE_BYTE (c1);
3732 if (c1 == ']')
3733 coding->mode |= CODING_MODE_DIRECTION;
3734 else
3735 goto invalid_code;
3736 break;
3738 default:
3739 goto invalid_code;
3741 continue;
3743 case '%':
3744 ONE_MORE_BYTE (c1);
3745 if (c1 == '/')
3747 /* CTEXT extended segment:
3748 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3749 We keep these bytes as is for the moment.
3750 They may be decoded by post-read-conversion. */
3751 int dim, M, L;
3752 int size;
3754 ONE_MORE_BYTE (dim);
3755 if (dim < '0' || dim > '4')
3756 goto invalid_code;
3757 ONE_MORE_BYTE (M);
3758 if (M < 128)
3759 goto invalid_code;
3760 ONE_MORE_BYTE (L);
3761 if (L < 128)
3762 goto invalid_code;
3763 size = ((M - 128) * 128) + (L - 128);
3764 if (charbuf + 6 > charbuf_end)
3765 goto break_loop;
3766 *charbuf++ = ISO_CODE_ESC;
3767 *charbuf++ = '%';
3768 *charbuf++ = '/';
3769 *charbuf++ = dim;
3770 *charbuf++ = BYTE8_TO_CHAR (M);
3771 *charbuf++ = BYTE8_TO_CHAR (L);
3772 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3774 else if (c1 == 'G')
3776 /* XFree86 extension for embedding UTF-8 in CTEXT:
3777 ESC % G --UTF-8-BYTES-- ESC % @
3778 We keep these bytes as is for the moment.
3779 They may be decoded by post-read-conversion. */
3780 if (charbuf + 3 > charbuf_end)
3781 goto break_loop;
3782 *charbuf++ = ISO_CODE_ESC;
3783 *charbuf++ = '%';
3784 *charbuf++ = 'G';
3785 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3787 else
3788 goto invalid_code;
3789 continue;
3790 break;
3792 default:
3793 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3794 goto invalid_code;
3796 int reg, chars96;
3798 if (c1 >= 0x28 && c1 <= 0x2B)
3799 { /* designation of DIMENSION1_CHARS94 character set */
3800 reg = c1 - 0x28, chars96 = 0;
3801 ONE_MORE_BYTE (c1);
3803 else if (c1 >= 0x2C && c1 <= 0x2F)
3804 { /* designation of DIMENSION1_CHARS96 character set */
3805 reg = c1 - 0x2C, chars96 = 1;
3806 ONE_MORE_BYTE (c1);
3808 else
3809 goto invalid_code;
3810 DECODE_DESIGNATION (reg, 1, chars96, c1);
3811 /* We must update these variables now. */
3812 if (reg == 0)
3813 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3814 else if (reg == 1)
3815 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3816 if (chars96 < 0)
3817 goto invalid_code;
3819 continue;
3821 break;
3823 default:
3824 emacs_abort ();
3827 if (cmp_status->state == COMPOSING_NO
3828 && charset->id != charset_ascii
3829 && last_id != charset->id)
3831 if (last_id != charset_ascii)
3832 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3833 last_id = charset->id;
3834 last_offset = char_offset;
3837 /* Now we know CHARSET and 1st position code C1 of a character.
3838 Produce a decoded character while getting 2nd and 3rd
3839 position codes C2, C3 if necessary. */
3840 if (CHARSET_DIMENSION (charset) > 1)
3842 ONE_MORE_BYTE (c2);
3843 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3844 || ((c1 & 0x80) != (c2 & 0x80)))
3845 /* C2 is not in a valid range. */
3846 goto invalid_code;
3847 if (CHARSET_DIMENSION (charset) == 2)
3848 c1 = (c1 << 8) | c2;
3849 else
3851 ONE_MORE_BYTE (c3);
3852 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3853 || ((c1 & 0x80) != (c3 & 0x80)))
3854 /* C3 is not in a valid range. */
3855 goto invalid_code;
3856 c1 = (c1 << 16) | (c2 << 8) | c2;
3859 c1 &= 0x7F7F7F;
3860 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3861 if (c < 0)
3863 MAYBE_FINISH_COMPOSITION ();
3864 for (; src_base < src; src_base++, char_offset++)
3866 if (ASCII_BYTE_P (*src_base))
3867 *charbuf++ = *src_base;
3868 else
3869 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3872 else if (cmp_status->state == COMPOSING_NO)
3874 *charbuf++ = c;
3875 char_offset++;
3877 else if ((cmp_status->state == COMPOSING_CHAR
3878 ? cmp_status->nchars
3879 : cmp_status->ncomps)
3880 >= MAX_COMPOSITION_COMPONENTS)
3882 /* Too long composition. */
3883 MAYBE_FINISH_COMPOSITION ();
3884 *charbuf++ = c;
3885 char_offset++;
3887 else
3888 STORE_COMPOSITION_CHAR (c);
3889 continue;
3891 invalid_code:
3892 MAYBE_FINISH_COMPOSITION ();
3893 src = src_base;
3894 consumed_chars = consumed_chars_base;
3895 ONE_MORE_BYTE (c);
3896 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3897 char_offset++;
3898 coding->errors++;
3899 continue;
3901 break_loop:
3902 break;
3905 no_more_source:
3906 if (cmp_status->state != COMPOSING_NO)
3908 if (coding->mode & CODING_MODE_LAST_BLOCK)
3909 MAYBE_FINISH_COMPOSITION ();
3910 else
3912 charbuf -= cmp_status->length;
3913 for (i = 0; i < cmp_status->length; i++)
3914 cmp_status->carryover[i] = charbuf[i];
3917 else if (last_id != charset_ascii)
3918 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3919 coding->consumed_char += consumed_chars_base;
3920 coding->consumed = src_base - coding->source;
3921 coding->charbuf_used = charbuf - coding->charbuf;
3925 /* ISO2022 encoding stuff. */
3928 It is not enough to say just "ISO2022" on encoding, we have to
3929 specify more details. In Emacs, each coding system of ISO2022
3930 variant has the following specifications:
3931 1. Initial designation to G0 thru G3.
3932 2. Allows short-form designation?
3933 3. ASCII should be designated to G0 before control characters?
3934 4. ASCII should be designated to G0 at end of line?
3935 5. 7-bit environment or 8-bit environment?
3936 6. Use locking-shift?
3937 7. Use Single-shift?
3938 And the following two are only for Japanese:
3939 8. Use ASCII in place of JIS0201-1976-Roman?
3940 9. Use JISX0208-1983 in place of JISX0208-1978?
3941 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3942 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3943 details.
3946 /* Produce codes (escape sequence) for designating CHARSET to graphic
3947 register REG at DST, and increment DST. If <final-char> of CHARSET is
3948 '@', 'A', or 'B' and the coding system CODING allows, produce
3949 designation sequence of short-form. */
3951 #define ENCODE_DESIGNATION(charset, reg, coding) \
3952 do { \
3953 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3954 const char *intermediate_char_94 = "()*+"; \
3955 const char *intermediate_char_96 = ",-./"; \
3956 int revision = -1; \
3958 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3959 revision = CHARSET_ISO_REVISION (charset); \
3961 if (revision >= 0) \
3963 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3964 EMIT_ONE_BYTE ('@' + revision); \
3966 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3967 if (CHARSET_DIMENSION (charset) == 1) \
3969 int b; \
3970 if (! CHARSET_ISO_CHARS_96 (charset)) \
3971 b = intermediate_char_94[reg]; \
3972 else \
3973 b = intermediate_char_96[reg]; \
3974 EMIT_ONE_ASCII_BYTE (b); \
3976 else \
3978 EMIT_ONE_ASCII_BYTE ('$'); \
3979 if (! CHARSET_ISO_CHARS_96 (charset)) \
3981 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3982 || reg != 0 \
3983 || final_char < '@' || final_char > 'B') \
3984 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3986 else \
3987 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3989 EMIT_ONE_ASCII_BYTE (final_char); \
3991 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3992 } while (0)
3995 /* The following two macros produce codes (control character or escape
3996 sequence) for ISO2022 single-shift functions (single-shift-2 and
3997 single-shift-3). */
3999 #define ENCODE_SINGLE_SHIFT_2 \
4000 do { \
4001 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4002 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4003 else \
4004 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4005 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4006 } while (0)
4009 #define ENCODE_SINGLE_SHIFT_3 \
4010 do { \
4011 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4012 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4013 else \
4014 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4015 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4016 } while (0)
4019 /* The following four macros produce codes (control character or
4020 escape sequence) for ISO2022 locking-shift functions (shift-in,
4021 shift-out, locking-shift-2, and locking-shift-3). */
4023 #define ENCODE_SHIFT_IN \
4024 do { \
4025 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4026 CODING_ISO_INVOCATION (coding, 0) = 0; \
4027 } while (0)
4030 #define ENCODE_SHIFT_OUT \
4031 do { \
4032 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4033 CODING_ISO_INVOCATION (coding, 0) = 1; \
4034 } while (0)
4037 #define ENCODE_LOCKING_SHIFT_2 \
4038 do { \
4039 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4040 CODING_ISO_INVOCATION (coding, 0) = 2; \
4041 } while (0)
4044 #define ENCODE_LOCKING_SHIFT_3 \
4045 do { \
4046 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4047 CODING_ISO_INVOCATION (coding, 0) = 3; \
4048 } while (0)
4051 /* Produce codes for a DIMENSION1 character whose character set is
4052 CHARSET and whose position-code is C1. Designation and invocation
4053 sequences are also produced in advance if necessary. */
4055 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4056 do { \
4057 int id = CHARSET_ID (charset); \
4059 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4060 && id == charset_ascii) \
4062 id = charset_jisx0201_roman; \
4063 charset = CHARSET_FROM_ID (id); \
4066 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4068 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4069 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4070 else \
4071 EMIT_ONE_BYTE (c1 | 0x80); \
4072 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4073 break; \
4075 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4077 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4078 break; \
4080 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4082 EMIT_ONE_BYTE (c1 | 0x80); \
4083 break; \
4085 else \
4086 /* Since CHARSET is not yet invoked to any graphic planes, we \
4087 must invoke it, or, at first, designate it to some graphic \
4088 register. Then repeat the loop to actually produce the \
4089 character. */ \
4090 dst = encode_invocation_designation (charset, coding, dst, \
4091 &produced_chars); \
4092 } while (1)
4095 /* Produce codes for a DIMENSION2 character whose character set is
4096 CHARSET and whose position-codes are C1 and C2. Designation and
4097 invocation codes are also produced in advance if necessary. */
4099 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4100 do { \
4101 int id = CHARSET_ID (charset); \
4103 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4104 && id == charset_jisx0208) \
4106 id = charset_jisx0208_1978; \
4107 charset = CHARSET_FROM_ID (id); \
4110 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4112 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4113 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4114 else \
4115 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4116 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4117 break; \
4119 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4121 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4122 break; \
4124 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4126 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4127 break; \
4129 else \
4130 /* Since CHARSET is not yet invoked to any graphic planes, we \
4131 must invoke it, or, at first, designate it to some graphic \
4132 register. Then repeat the loop to actually produce the \
4133 character. */ \
4134 dst = encode_invocation_designation (charset, coding, dst, \
4135 &produced_chars); \
4136 } while (1)
4139 #define ENCODE_ISO_CHARACTER(charset, c) \
4140 do { \
4141 unsigned code; \
4142 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4144 if (CHARSET_DIMENSION (charset) == 1) \
4145 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4146 else \
4147 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4148 } while (0)
4151 /* Produce designation and invocation codes at a place pointed by DST
4152 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4153 Return new DST. */
4155 static unsigned char *
4156 encode_invocation_designation (struct charset *charset,
4157 struct coding_system *coding,
4158 unsigned char *dst, ptrdiff_t *p_nchars)
4160 bool multibytep = coding->dst_multibyte;
4161 ptrdiff_t produced_chars = *p_nchars;
4162 int reg; /* graphic register number */
4163 int id = CHARSET_ID (charset);
4165 /* At first, check designations. */
4166 for (reg = 0; reg < 4; reg++)
4167 if (id == CODING_ISO_DESIGNATION (coding, reg))
4168 break;
4170 if (reg >= 4)
4172 /* CHARSET is not yet designated to any graphic registers. */
4173 /* At first check the requested designation. */
4174 reg = CODING_ISO_REQUEST (coding, id);
4175 if (reg < 0)
4176 /* Since CHARSET requests no special designation, designate it
4177 to graphic register 0. */
4178 reg = 0;
4180 ENCODE_DESIGNATION (charset, reg, coding);
4183 if (CODING_ISO_INVOCATION (coding, 0) != reg
4184 && CODING_ISO_INVOCATION (coding, 1) != reg)
4186 /* Since the graphic register REG is not invoked to any graphic
4187 planes, invoke it to graphic plane 0. */
4188 switch (reg)
4190 case 0: /* graphic register 0 */
4191 ENCODE_SHIFT_IN;
4192 break;
4194 case 1: /* graphic register 1 */
4195 ENCODE_SHIFT_OUT;
4196 break;
4198 case 2: /* graphic register 2 */
4199 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4200 ENCODE_SINGLE_SHIFT_2;
4201 else
4202 ENCODE_LOCKING_SHIFT_2;
4203 break;
4205 case 3: /* graphic register 3 */
4206 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4207 ENCODE_SINGLE_SHIFT_3;
4208 else
4209 ENCODE_LOCKING_SHIFT_3;
4210 break;
4214 *p_nchars = produced_chars;
4215 return dst;
4219 /* Produce codes for designation and invocation to reset the graphic
4220 planes and registers to initial state. */
4221 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4222 do { \
4223 int reg; \
4224 struct charset *charset; \
4226 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4227 ENCODE_SHIFT_IN; \
4228 for (reg = 0; reg < 4; reg++) \
4229 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4230 && (CODING_ISO_DESIGNATION (coding, reg) \
4231 != CODING_ISO_INITIAL (coding, reg))) \
4233 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4234 ENCODE_DESIGNATION (charset, reg, coding); \
4236 } while (0)
4239 /* Produce designation sequences of charsets in the line started from
4240 CHARBUF to a place pointed by DST, and return the number of
4241 produced bytes. DST should not directly point a buffer text area
4242 which may be relocated by char_charset call.
4244 If the current block ends before any end-of-line, we may fail to
4245 find all the necessary designations. */
4247 static ptrdiff_t
4248 encode_designation_at_bol (struct coding_system *coding,
4249 int *charbuf, int *charbuf_end,
4250 unsigned char *dst)
4252 unsigned char *orig = dst;
4253 struct charset *charset;
4254 /* Table of charsets to be designated to each graphic register. */
4255 int r[4];
4256 int c, found = 0, reg;
4257 ptrdiff_t produced_chars = 0;
4258 bool multibytep = coding->dst_multibyte;
4259 Lisp_Object attrs;
4260 Lisp_Object charset_list;
4262 attrs = CODING_ID_ATTRS (coding->id);
4263 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4264 if (EQ (charset_list, Qiso_2022))
4265 charset_list = Viso_2022_charset_list;
4267 for (reg = 0; reg < 4; reg++)
4268 r[reg] = -1;
4270 while (charbuf < charbuf_end && found < 4)
4272 int id;
4274 c = *charbuf++;
4275 if (c == '\n')
4276 break;
4277 charset = char_charset (c, charset_list, NULL);
4278 id = CHARSET_ID (charset);
4279 reg = CODING_ISO_REQUEST (coding, id);
4280 if (reg >= 0 && r[reg] < 0)
4282 found++;
4283 r[reg] = id;
4287 if (found)
4289 for (reg = 0; reg < 4; reg++)
4290 if (r[reg] >= 0
4291 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4292 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4295 return dst - orig;
4298 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4300 static bool
4301 encode_coding_iso_2022 (struct coding_system *coding)
4303 bool multibytep = coding->dst_multibyte;
4304 int *charbuf = coding->charbuf;
4305 int *charbuf_end = charbuf + coding->charbuf_used;
4306 unsigned char *dst = coding->destination + coding->produced;
4307 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4308 int safe_room = 16;
4309 bool bol_designation
4310 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4311 && CODING_ISO_BOL (coding));
4312 ptrdiff_t produced_chars = 0;
4313 Lisp_Object attrs, eol_type, charset_list;
4314 bool ascii_compatible;
4315 int c;
4316 int preferred_charset_id = -1;
4318 CODING_GET_INFO (coding, attrs, charset_list);
4319 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4320 if (VECTORP (eol_type))
4321 eol_type = Qunix;
4323 setup_iso_safe_charsets (attrs);
4324 /* Charset list may have been changed. */
4325 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4326 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4328 ascii_compatible
4329 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4330 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4331 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4333 while (charbuf < charbuf_end)
4335 ASSURE_DESTINATION (safe_room);
4337 if (bol_designation)
4339 /* We have to produce designation sequences if any now. */
4340 unsigned char desig_buf[16];
4341 int nbytes;
4342 ptrdiff_t offset;
4344 charset_map_loaded = 0;
4345 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4346 desig_buf);
4347 if (charset_map_loaded
4348 && (offset = coding_change_destination (coding)))
4350 dst += offset;
4351 dst_end += offset;
4353 memcpy (dst, desig_buf, nbytes);
4354 dst += nbytes;
4355 /* We are sure that designation sequences are all ASCII bytes. */
4356 produced_chars += nbytes;
4357 bol_designation = 0;
4358 ASSURE_DESTINATION (safe_room);
4361 c = *charbuf++;
4363 if (c < 0)
4365 /* Handle an annotation. */
4366 switch (*charbuf)
4368 case CODING_ANNOTATE_COMPOSITION_MASK:
4369 /* Not yet implemented. */
4370 break;
4371 case CODING_ANNOTATE_CHARSET_MASK:
4372 preferred_charset_id = charbuf[2];
4373 if (preferred_charset_id >= 0
4374 && NILP (Fmemq (make_number (preferred_charset_id),
4375 charset_list)))
4376 preferred_charset_id = -1;
4377 break;
4378 default:
4379 emacs_abort ();
4381 charbuf += -c - 1;
4382 continue;
4385 /* Now encode the character C. */
4386 if (c < 0x20 || c == 0x7F)
4388 if (c == '\n'
4389 || (c == '\r' && EQ (eol_type, Qmac)))
4391 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4392 ENCODE_RESET_PLANE_AND_REGISTER ();
4393 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4395 int i;
4397 for (i = 0; i < 4; i++)
4398 CODING_ISO_DESIGNATION (coding, i)
4399 = CODING_ISO_INITIAL (coding, i);
4401 bol_designation = ((CODING_ISO_FLAGS (coding)
4402 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4403 != 0);
4405 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4406 ENCODE_RESET_PLANE_AND_REGISTER ();
4407 EMIT_ONE_ASCII_BYTE (c);
4409 else if (ASCII_CHAR_P (c))
4411 if (ascii_compatible)
4412 EMIT_ONE_ASCII_BYTE (c);
4413 else
4415 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4416 ENCODE_ISO_CHARACTER (charset, c);
4419 else if (CHAR_BYTE8_P (c))
4421 c = CHAR_TO_BYTE8 (c);
4422 EMIT_ONE_BYTE (c);
4424 else
4426 struct charset *charset;
4428 if (preferred_charset_id >= 0)
4430 bool result;
4432 charset = CHARSET_FROM_ID (preferred_charset_id);
4433 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4434 if (! result)
4435 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4436 NULL, charset);
4438 else
4439 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4440 NULL, charset);
4441 if (!charset)
4443 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4445 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4446 charset = CHARSET_FROM_ID (charset_ascii);
4448 else
4450 c = coding->default_char;
4451 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4452 charset_list, NULL, charset);
4455 ENCODE_ISO_CHARACTER (charset, c);
4459 if (coding->mode & CODING_MODE_LAST_BLOCK
4460 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4462 ASSURE_DESTINATION (safe_room);
4463 ENCODE_RESET_PLANE_AND_REGISTER ();
4465 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4466 CODING_ISO_BOL (coding) = bol_designation;
4467 coding->produced_char += produced_chars;
4468 coding->produced = dst - coding->destination;
4469 return 0;
4473 /*** 8,9. SJIS and BIG5 handlers ***/
4475 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4476 quite widely. So, for the moment, Emacs supports them in the bare
4477 C code. But, in the future, they may be supported only by CCL. */
4479 /* SJIS is a coding system encoding three character sets: ASCII, right
4480 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4481 as is. A character of charset katakana-jisx0201 is encoded by
4482 "position-code + 0x80". A character of charset japanese-jisx0208
4483 is encoded in 2-byte but two position-codes are divided and shifted
4484 so that it fit in the range below.
4486 --- CODE RANGE of SJIS ---
4487 (character set) (range)
4488 ASCII 0x00 .. 0x7F
4489 KATAKANA-JISX0201 0xA0 .. 0xDF
4490 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4491 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4492 -------------------------------
4496 /* BIG5 is a coding system encoding two character sets: ASCII and
4497 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4498 character set and is encoded in two-byte.
4500 --- CODE RANGE of BIG5 ---
4501 (character set) (range)
4502 ASCII 0x00 .. 0x7F
4503 Big5 (1st byte) 0xA1 .. 0xFE
4504 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4505 --------------------------
4509 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4510 Return true if a text is encoded in SJIS. */
4512 static bool
4513 detect_coding_sjis (struct coding_system *coding,
4514 struct coding_detection_info *detect_info)
4516 const unsigned char *src = coding->source, *src_base;
4517 const unsigned char *src_end = coding->source + coding->src_bytes;
4518 bool multibytep = coding->src_multibyte;
4519 ptrdiff_t consumed_chars = 0;
4520 int found = 0;
4521 int c;
4522 Lisp_Object attrs, charset_list;
4523 int max_first_byte_of_2_byte_code;
4525 CODING_GET_INFO (coding, attrs, charset_list);
4526 max_first_byte_of_2_byte_code
4527 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4529 detect_info->checked |= CATEGORY_MASK_SJIS;
4530 /* A coding system of this category is always ASCII compatible. */
4531 src += coding->head_ascii;
4533 while (1)
4535 src_base = src;
4536 ONE_MORE_BYTE (c);
4537 if (c < 0x80)
4538 continue;
4539 if ((c >= 0x81 && c <= 0x9F)
4540 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4542 ONE_MORE_BYTE (c);
4543 if (c < 0x40 || c == 0x7F || c > 0xFC)
4544 break;
4545 found = CATEGORY_MASK_SJIS;
4547 else if (c >= 0xA0 && c < 0xE0)
4548 found = CATEGORY_MASK_SJIS;
4549 else
4550 break;
4552 detect_info->rejected |= CATEGORY_MASK_SJIS;
4553 return 0;
4555 no_more_source:
4556 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4558 detect_info->rejected |= CATEGORY_MASK_SJIS;
4559 return 0;
4561 detect_info->found |= found;
4562 return 1;
4565 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4566 Return true if a text is encoded in BIG5. */
4568 static bool
4569 detect_coding_big5 (struct coding_system *coding,
4570 struct coding_detection_info *detect_info)
4572 const unsigned char *src = coding->source, *src_base;
4573 const unsigned char *src_end = coding->source + coding->src_bytes;
4574 bool multibytep = coding->src_multibyte;
4575 ptrdiff_t consumed_chars = 0;
4576 int found = 0;
4577 int c;
4579 detect_info->checked |= CATEGORY_MASK_BIG5;
4580 /* A coding system of this category is always ASCII compatible. */
4581 src += coding->head_ascii;
4583 while (1)
4585 src_base = src;
4586 ONE_MORE_BYTE (c);
4587 if (c < 0x80)
4588 continue;
4589 if (c >= 0xA1)
4591 ONE_MORE_BYTE (c);
4592 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4593 return 0;
4594 found = CATEGORY_MASK_BIG5;
4596 else
4597 break;
4599 detect_info->rejected |= CATEGORY_MASK_BIG5;
4600 return 0;
4602 no_more_source:
4603 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4605 detect_info->rejected |= CATEGORY_MASK_BIG5;
4606 return 0;
4608 detect_info->found |= found;
4609 return 1;
4612 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4614 static void
4615 decode_coding_sjis (struct coding_system *coding)
4617 const unsigned char *src = coding->source + coding->consumed;
4618 const unsigned char *src_end = coding->source + coding->src_bytes;
4619 const unsigned char *src_base;
4620 int *charbuf = coding->charbuf + coding->charbuf_used;
4621 /* We may produce one charset annotation in one loop and one more at
4622 the end. */
4623 int *charbuf_end
4624 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4625 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4626 bool multibytep = coding->src_multibyte;
4627 struct charset *charset_roman, *charset_kanji, *charset_kana;
4628 struct charset *charset_kanji2;
4629 Lisp_Object attrs, charset_list, val;
4630 ptrdiff_t char_offset = coding->produced_char;
4631 ptrdiff_t last_offset = char_offset;
4632 int last_id = charset_ascii;
4633 bool eol_dos
4634 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4635 int byte_after_cr = -1;
4637 CODING_GET_INFO (coding, attrs, charset_list);
4639 val = charset_list;
4640 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4641 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4642 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4643 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4645 while (1)
4647 int c, c1;
4648 struct charset *charset;
4650 src_base = src;
4651 consumed_chars_base = consumed_chars;
4653 if (charbuf >= charbuf_end)
4655 if (byte_after_cr >= 0)
4656 src_base--;
4657 break;
4660 if (byte_after_cr >= 0)
4661 c = byte_after_cr, byte_after_cr = -1;
4662 else
4663 ONE_MORE_BYTE (c);
4664 if (c < 0)
4665 goto invalid_code;
4666 if (c < 0x80)
4668 if (eol_dos && c == '\r')
4669 ONE_MORE_BYTE (byte_after_cr);
4670 charset = charset_roman;
4672 else if (c == 0x80 || c == 0xA0)
4673 goto invalid_code;
4674 else if (c >= 0xA1 && c <= 0xDF)
4676 /* SJIS -> JISX0201-Kana */
4677 c &= 0x7F;
4678 charset = charset_kana;
4680 else if (c <= 0xEF)
4682 /* SJIS -> JISX0208 */
4683 ONE_MORE_BYTE (c1);
4684 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4685 goto invalid_code;
4686 c = (c << 8) | c1;
4687 SJIS_TO_JIS (c);
4688 charset = charset_kanji;
4690 else if (c <= 0xFC && charset_kanji2)
4692 /* SJIS -> JISX0213-2 */
4693 ONE_MORE_BYTE (c1);
4694 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4695 goto invalid_code;
4696 c = (c << 8) | c1;
4697 SJIS_TO_JIS2 (c);
4698 charset = charset_kanji2;
4700 else
4701 goto invalid_code;
4702 if (charset->id != charset_ascii
4703 && last_id != charset->id)
4705 if (last_id != charset_ascii)
4706 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4707 last_id = charset->id;
4708 last_offset = char_offset;
4710 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4711 *charbuf++ = c;
4712 char_offset++;
4713 continue;
4715 invalid_code:
4716 src = src_base;
4717 consumed_chars = consumed_chars_base;
4718 ONE_MORE_BYTE (c);
4719 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4720 char_offset++;
4721 coding->errors++;
4724 no_more_source:
4725 if (last_id != charset_ascii)
4726 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4727 coding->consumed_char += consumed_chars_base;
4728 coding->consumed = src_base - coding->source;
4729 coding->charbuf_used = charbuf - coding->charbuf;
4732 static void
4733 decode_coding_big5 (struct coding_system *coding)
4735 const unsigned char *src = coding->source + coding->consumed;
4736 const unsigned char *src_end = coding->source + coding->src_bytes;
4737 const unsigned char *src_base;
4738 int *charbuf = coding->charbuf + coding->charbuf_used;
4739 /* We may produce one charset annotation in one loop and one more at
4740 the end. */
4741 int *charbuf_end
4742 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4743 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4744 bool multibytep = coding->src_multibyte;
4745 struct charset *charset_roman, *charset_big5;
4746 Lisp_Object attrs, charset_list, val;
4747 ptrdiff_t char_offset = coding->produced_char;
4748 ptrdiff_t last_offset = char_offset;
4749 int last_id = charset_ascii;
4750 bool eol_dos
4751 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4752 int byte_after_cr = -1;
4754 CODING_GET_INFO (coding, attrs, charset_list);
4755 val = charset_list;
4756 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4757 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4759 while (1)
4761 int c, c1;
4762 struct charset *charset;
4764 src_base = src;
4765 consumed_chars_base = consumed_chars;
4767 if (charbuf >= charbuf_end)
4769 if (byte_after_cr >= 0)
4770 src_base--;
4771 break;
4774 if (byte_after_cr >= 0)
4775 c = byte_after_cr, byte_after_cr = -1;
4776 else
4777 ONE_MORE_BYTE (c);
4779 if (c < 0)
4780 goto invalid_code;
4781 if (c < 0x80)
4783 if (eol_dos && c == '\r')
4784 ONE_MORE_BYTE (byte_after_cr);
4785 charset = charset_roman;
4787 else
4789 /* BIG5 -> Big5 */
4790 if (c < 0xA1 || c > 0xFE)
4791 goto invalid_code;
4792 ONE_MORE_BYTE (c1);
4793 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4794 goto invalid_code;
4795 c = c << 8 | c1;
4796 charset = charset_big5;
4798 if (charset->id != charset_ascii
4799 && last_id != charset->id)
4801 if (last_id != charset_ascii)
4802 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4803 last_id = charset->id;
4804 last_offset = char_offset;
4806 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4807 *charbuf++ = c;
4808 char_offset++;
4809 continue;
4811 invalid_code:
4812 src = src_base;
4813 consumed_chars = consumed_chars_base;
4814 ONE_MORE_BYTE (c);
4815 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4816 char_offset++;
4817 coding->errors++;
4820 no_more_source:
4821 if (last_id != charset_ascii)
4822 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4823 coding->consumed_char += consumed_chars_base;
4824 coding->consumed = src_base - coding->source;
4825 coding->charbuf_used = charbuf - coding->charbuf;
4828 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4829 This function can encode charsets `ascii', `katakana-jisx0201',
4830 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4831 are sure that all these charsets are registered as official charset
4832 (i.e. do not have extended leading-codes). Characters of other
4833 charsets are produced without any encoding. */
4835 static bool
4836 encode_coding_sjis (struct coding_system *coding)
4838 bool multibytep = coding->dst_multibyte;
4839 int *charbuf = coding->charbuf;
4840 int *charbuf_end = charbuf + coding->charbuf_used;
4841 unsigned char *dst = coding->destination + coding->produced;
4842 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4843 int safe_room = 4;
4844 ptrdiff_t produced_chars = 0;
4845 Lisp_Object attrs, charset_list, val;
4846 bool ascii_compatible;
4847 struct charset *charset_kanji, *charset_kana;
4848 struct charset *charset_kanji2;
4849 int c;
4851 CODING_GET_INFO (coding, attrs, charset_list);
4852 val = XCDR (charset_list);
4853 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4854 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4855 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4857 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4859 while (charbuf < charbuf_end)
4861 ASSURE_DESTINATION (safe_room);
4862 c = *charbuf++;
4863 /* Now encode the character C. */
4864 if (ASCII_CHAR_P (c) && ascii_compatible)
4865 EMIT_ONE_ASCII_BYTE (c);
4866 else if (CHAR_BYTE8_P (c))
4868 c = CHAR_TO_BYTE8 (c);
4869 EMIT_ONE_BYTE (c);
4871 else
4873 unsigned code;
4874 struct charset *charset;
4875 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4876 &code, charset);
4878 if (!charset)
4880 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4882 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4883 charset = CHARSET_FROM_ID (charset_ascii);
4885 else
4887 c = coding->default_char;
4888 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4889 charset_list, &code, charset);
4892 if (code == CHARSET_INVALID_CODE (charset))
4893 emacs_abort ();
4894 if (charset == charset_kanji)
4896 int c1, c2;
4897 JIS_TO_SJIS (code);
4898 c1 = code >> 8, c2 = code & 0xFF;
4899 EMIT_TWO_BYTES (c1, c2);
4901 else if (charset == charset_kana)
4902 EMIT_ONE_BYTE (code | 0x80);
4903 else if (charset_kanji2 && charset == charset_kanji2)
4905 int c1, c2;
4907 c1 = code >> 8;
4908 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4909 || c1 == 0x28
4910 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4912 JIS_TO_SJIS2 (code);
4913 c1 = code >> 8, c2 = code & 0xFF;
4914 EMIT_TWO_BYTES (c1, c2);
4916 else
4917 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4919 else
4920 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4923 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4924 coding->produced_char += produced_chars;
4925 coding->produced = dst - coding->destination;
4926 return 0;
4929 static bool
4930 encode_coding_big5 (struct coding_system *coding)
4932 bool multibytep = coding->dst_multibyte;
4933 int *charbuf = coding->charbuf;
4934 int *charbuf_end = charbuf + coding->charbuf_used;
4935 unsigned char *dst = coding->destination + coding->produced;
4936 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4937 int safe_room = 4;
4938 ptrdiff_t produced_chars = 0;
4939 Lisp_Object attrs, charset_list, val;
4940 bool ascii_compatible;
4941 struct charset *charset_big5;
4942 int c;
4944 CODING_GET_INFO (coding, attrs, charset_list);
4945 val = XCDR (charset_list);
4946 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4947 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4949 while (charbuf < charbuf_end)
4951 ASSURE_DESTINATION (safe_room);
4952 c = *charbuf++;
4953 /* Now encode the character C. */
4954 if (ASCII_CHAR_P (c) && ascii_compatible)
4955 EMIT_ONE_ASCII_BYTE (c);
4956 else if (CHAR_BYTE8_P (c))
4958 c = CHAR_TO_BYTE8 (c);
4959 EMIT_ONE_BYTE (c);
4961 else
4963 unsigned code;
4964 struct charset *charset;
4965 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4966 &code, charset);
4968 if (! charset)
4970 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4972 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4973 charset = CHARSET_FROM_ID (charset_ascii);
4975 else
4977 c = coding->default_char;
4978 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4979 charset_list, &code, charset);
4982 if (code == CHARSET_INVALID_CODE (charset))
4983 emacs_abort ();
4984 if (charset == charset_big5)
4986 int c1, c2;
4988 c1 = code >> 8, c2 = code & 0xFF;
4989 EMIT_TWO_BYTES (c1, c2);
4991 else
4992 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4995 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4996 coding->produced_char += produced_chars;
4997 coding->produced = dst - coding->destination;
4998 return 0;
5002 /*** 10. CCL handlers ***/
5004 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5005 Return true if a text is encoded in a coding system of which
5006 encoder/decoder are written in CCL program. */
5008 static bool
5009 detect_coding_ccl (struct coding_system *coding,
5010 struct coding_detection_info *detect_info)
5012 const unsigned char *src = coding->source, *src_base;
5013 const unsigned char *src_end = coding->source + coding->src_bytes;
5014 bool multibytep = coding->src_multibyte;
5015 ptrdiff_t consumed_chars = 0;
5016 int found = 0;
5017 unsigned char *valids;
5018 ptrdiff_t head_ascii = coding->head_ascii;
5019 Lisp_Object attrs;
5021 detect_info->checked |= CATEGORY_MASK_CCL;
5023 coding = &coding_categories[coding_category_ccl];
5024 valids = CODING_CCL_VALIDS (coding);
5025 attrs = CODING_ID_ATTRS (coding->id);
5026 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5027 src += head_ascii;
5029 while (1)
5031 int c;
5033 src_base = src;
5034 ONE_MORE_BYTE (c);
5035 if (c < 0 || ! valids[c])
5036 break;
5037 if ((valids[c] > 1))
5038 found = CATEGORY_MASK_CCL;
5040 detect_info->rejected |= CATEGORY_MASK_CCL;
5041 return 0;
5043 no_more_source:
5044 detect_info->found |= found;
5045 return 1;
5048 static void
5049 decode_coding_ccl (struct coding_system *coding)
5051 const unsigned char *src = coding->source + coding->consumed;
5052 const unsigned char *src_end = coding->source + coding->src_bytes;
5053 int *charbuf = coding->charbuf + coding->charbuf_used;
5054 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5055 ptrdiff_t consumed_chars = 0;
5056 bool multibytep = coding->src_multibyte;
5057 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5058 int source_charbuf[1024];
5059 int source_byteidx[1025];
5060 Lisp_Object attrs, charset_list;
5062 CODING_GET_INFO (coding, attrs, charset_list);
5064 while (1)
5066 const unsigned char *p = src;
5067 ptrdiff_t offset;
5068 int i = 0;
5070 if (multibytep)
5072 while (i < 1024 && p < src_end)
5074 source_byteidx[i] = p - src;
5075 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5077 source_byteidx[i] = p - src;
5079 else
5080 while (i < 1024 && p < src_end)
5081 source_charbuf[i++] = *p++;
5083 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5084 ccl->last_block = 1;
5085 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5086 charset_map_loaded = 0;
5087 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5088 charset_list);
5089 if (charset_map_loaded
5090 && (offset = coding_change_source (coding)))
5092 p += offset;
5093 src += offset;
5094 src_end += offset;
5096 charbuf += ccl->produced;
5097 if (multibytep)
5098 src += source_byteidx[ccl->consumed];
5099 else
5100 src += ccl->consumed;
5101 consumed_chars += ccl->consumed;
5102 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5103 break;
5106 switch (ccl->status)
5108 case CCL_STAT_SUSPEND_BY_SRC:
5109 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5110 break;
5111 case CCL_STAT_SUSPEND_BY_DST:
5112 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5113 break;
5114 case CCL_STAT_QUIT:
5115 case CCL_STAT_INVALID_CMD:
5116 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5117 break;
5118 default:
5119 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5120 break;
5122 coding->consumed_char += consumed_chars;
5123 coding->consumed = src - coding->source;
5124 coding->charbuf_used = charbuf - coding->charbuf;
5127 static bool
5128 encode_coding_ccl (struct coding_system *coding)
5130 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5131 bool multibytep = coding->dst_multibyte;
5132 int *charbuf = coding->charbuf;
5133 int *charbuf_end = charbuf + coding->charbuf_used;
5134 unsigned char *dst = coding->destination + coding->produced;
5135 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5136 int destination_charbuf[1024];
5137 ptrdiff_t produced_chars = 0;
5138 int i;
5139 Lisp_Object attrs, charset_list;
5141 CODING_GET_INFO (coding, attrs, charset_list);
5142 if (coding->consumed_char == coding->src_chars
5143 && coding->mode & CODING_MODE_LAST_BLOCK)
5144 ccl->last_block = 1;
5148 ptrdiff_t offset;
5150 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5151 charset_map_loaded = 0;
5152 ccl_driver (ccl, charbuf, destination_charbuf,
5153 charbuf_end - charbuf, 1024, charset_list);
5154 if (charset_map_loaded
5155 && (offset = coding_change_destination (coding)))
5156 dst += offset;
5157 if (multibytep)
5159 ASSURE_DESTINATION (ccl->produced * 2);
5160 for (i = 0; i < ccl->produced; i++)
5161 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5163 else
5165 ASSURE_DESTINATION (ccl->produced);
5166 for (i = 0; i < ccl->produced; i++)
5167 *dst++ = destination_charbuf[i] & 0xFF;
5168 produced_chars += ccl->produced;
5170 charbuf += ccl->consumed;
5171 if (ccl->status == CCL_STAT_QUIT
5172 || ccl->status == CCL_STAT_INVALID_CMD)
5173 break;
5175 while (charbuf < charbuf_end);
5177 switch (ccl->status)
5179 case CCL_STAT_SUSPEND_BY_SRC:
5180 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5181 break;
5182 case CCL_STAT_SUSPEND_BY_DST:
5183 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5184 break;
5185 case CCL_STAT_QUIT:
5186 case CCL_STAT_INVALID_CMD:
5187 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5188 break;
5189 default:
5190 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5191 break;
5194 coding->produced_char += produced_chars;
5195 coding->produced = dst - coding->destination;
5196 return 0;
5200 /*** 10, 11. no-conversion handlers ***/
5202 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5204 static void
5205 decode_coding_raw_text (struct coding_system *coding)
5207 bool eol_dos
5208 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5210 coding->chars_at_source = 1;
5211 coding->consumed_char = coding->src_chars;
5212 coding->consumed = coding->src_bytes;
5213 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5215 coding->consumed_char--;
5216 coding->consumed--;
5217 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5219 else
5220 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5223 static bool
5224 encode_coding_raw_text (struct coding_system *coding)
5226 bool multibytep = coding->dst_multibyte;
5227 int *charbuf = coding->charbuf;
5228 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5229 unsigned char *dst = coding->destination + coding->produced;
5230 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5231 ptrdiff_t produced_chars = 0;
5232 int c;
5234 if (multibytep)
5236 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5238 if (coding->src_multibyte)
5239 while (charbuf < charbuf_end)
5241 ASSURE_DESTINATION (safe_room);
5242 c = *charbuf++;
5243 if (ASCII_CHAR_P (c))
5244 EMIT_ONE_ASCII_BYTE (c);
5245 else if (CHAR_BYTE8_P (c))
5247 c = CHAR_TO_BYTE8 (c);
5248 EMIT_ONE_BYTE (c);
5250 else
5252 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5254 CHAR_STRING_ADVANCE (c, p1);
5257 EMIT_ONE_BYTE (*p0);
5258 p0++;
5260 while (p0 < p1);
5263 else
5264 while (charbuf < charbuf_end)
5266 ASSURE_DESTINATION (safe_room);
5267 c = *charbuf++;
5268 EMIT_ONE_BYTE (c);
5271 else
5273 if (coding->src_multibyte)
5275 int safe_room = MAX_MULTIBYTE_LENGTH;
5277 while (charbuf < charbuf_end)
5279 ASSURE_DESTINATION (safe_room);
5280 c = *charbuf++;
5281 if (ASCII_CHAR_P (c))
5282 *dst++ = c;
5283 else if (CHAR_BYTE8_P (c))
5284 *dst++ = CHAR_TO_BYTE8 (c);
5285 else
5286 CHAR_STRING_ADVANCE (c, dst);
5289 else
5291 ASSURE_DESTINATION (charbuf_end - charbuf);
5292 while (charbuf < charbuf_end && dst < dst_end)
5293 *dst++ = *charbuf++;
5295 produced_chars = dst - (coding->destination + coding->produced);
5297 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5298 coding->produced_char += produced_chars;
5299 coding->produced = dst - coding->destination;
5300 return 0;
5303 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5304 Return true if a text is encoded in a charset-based coding system. */
5306 static bool
5307 detect_coding_charset (struct coding_system *coding,
5308 struct coding_detection_info *detect_info)
5310 const unsigned char *src = coding->source, *src_base;
5311 const unsigned char *src_end = coding->source + coding->src_bytes;
5312 bool multibytep = coding->src_multibyte;
5313 ptrdiff_t consumed_chars = 0;
5314 Lisp_Object attrs, valids, name;
5315 int found = 0;
5316 ptrdiff_t head_ascii = coding->head_ascii;
5317 bool check_latin_extra = 0;
5319 detect_info->checked |= CATEGORY_MASK_CHARSET;
5321 coding = &coding_categories[coding_category_charset];
5322 attrs = CODING_ID_ATTRS (coding->id);
5323 valids = AREF (attrs, coding_attr_charset_valids);
5324 name = CODING_ID_NAME (coding->id);
5325 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5326 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5327 || strncmp (SSDATA (SYMBOL_NAME (name)),
5328 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5329 check_latin_extra = 1;
5331 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5332 src += head_ascii;
5334 while (1)
5336 int c;
5337 Lisp_Object val;
5338 struct charset *charset;
5339 int dim, idx;
5341 src_base = src;
5342 ONE_MORE_BYTE (c);
5343 if (c < 0)
5344 continue;
5345 val = AREF (valids, c);
5346 if (NILP (val))
5347 break;
5348 if (c >= 0x80)
5350 if (c < 0xA0
5351 && check_latin_extra
5352 && (!VECTORP (Vlatin_extra_code_table)
5353 || NILP (AREF (Vlatin_extra_code_table, c))))
5354 break;
5355 found = CATEGORY_MASK_CHARSET;
5357 if (INTEGERP (val))
5359 charset = CHARSET_FROM_ID (XFASTINT (val));
5360 dim = CHARSET_DIMENSION (charset);
5361 for (idx = 1; idx < dim; idx++)
5363 if (src == src_end)
5364 goto too_short;
5365 ONE_MORE_BYTE (c);
5366 if (c < charset->code_space[(dim - 1 - idx) * 4]
5367 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5368 break;
5370 if (idx < dim)
5371 break;
5373 else
5375 idx = 1;
5376 for (; CONSP (val); val = XCDR (val))
5378 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5379 dim = CHARSET_DIMENSION (charset);
5380 while (idx < dim)
5382 if (src == src_end)
5383 goto too_short;
5384 ONE_MORE_BYTE (c);
5385 if (c < charset->code_space[(dim - 1 - idx) * 4]
5386 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5387 break;
5388 idx++;
5390 if (idx == dim)
5392 val = Qnil;
5393 break;
5396 if (CONSP (val))
5397 break;
5400 too_short:
5401 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5402 return 0;
5404 no_more_source:
5405 detect_info->found |= found;
5406 return 1;
5409 static void
5410 decode_coding_charset (struct coding_system *coding)
5412 const unsigned char *src = coding->source + coding->consumed;
5413 const unsigned char *src_end = coding->source + coding->src_bytes;
5414 const unsigned char *src_base;
5415 int *charbuf = coding->charbuf + coding->charbuf_used;
5416 /* We may produce one charset annotation in one loop and one more at
5417 the end. */
5418 int *charbuf_end
5419 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5420 ptrdiff_t consumed_chars = 0, consumed_chars_base;
5421 bool multibytep = coding->src_multibyte;
5422 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5423 Lisp_Object valids;
5424 ptrdiff_t char_offset = coding->produced_char;
5425 ptrdiff_t last_offset = char_offset;
5426 int last_id = charset_ascii;
5427 bool eol_dos
5428 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5429 int byte_after_cr = -1;
5431 valids = AREF (attrs, coding_attr_charset_valids);
5433 while (1)
5435 int c;
5436 Lisp_Object val;
5437 struct charset *charset;
5438 int dim;
5439 int len = 1;
5440 unsigned code;
5442 src_base = src;
5443 consumed_chars_base = consumed_chars;
5445 if (charbuf >= charbuf_end)
5447 if (byte_after_cr >= 0)
5448 src_base--;
5449 break;
5452 if (byte_after_cr >= 0)
5454 c = byte_after_cr;
5455 byte_after_cr = -1;
5457 else
5459 ONE_MORE_BYTE (c);
5460 if (eol_dos && c == '\r')
5461 ONE_MORE_BYTE (byte_after_cr);
5463 if (c < 0)
5464 goto invalid_code;
5465 code = c;
5467 val = AREF (valids, c);
5468 if (! INTEGERP (val) && ! CONSP (val))
5469 goto invalid_code;
5470 if (INTEGERP (val))
5472 charset = CHARSET_FROM_ID (XFASTINT (val));
5473 dim = CHARSET_DIMENSION (charset);
5474 while (len < dim)
5476 ONE_MORE_BYTE (c);
5477 code = (code << 8) | c;
5478 len++;
5480 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5481 charset, code, c);
5483 else
5485 /* VAL is a list of charset IDs. It is assured that the
5486 list is sorted by charset dimensions (smaller one
5487 comes first). */
5488 while (CONSP (val))
5490 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5491 dim = CHARSET_DIMENSION (charset);
5492 while (len < dim)
5494 ONE_MORE_BYTE (c);
5495 code = (code << 8) | c;
5496 len++;
5498 CODING_DECODE_CHAR (coding, src, src_base,
5499 src_end, charset, code, c);
5500 if (c >= 0)
5501 break;
5502 val = XCDR (val);
5505 if (c < 0)
5506 goto invalid_code;
5507 if (charset->id != charset_ascii
5508 && last_id != charset->id)
5510 if (last_id != charset_ascii)
5511 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5512 last_id = charset->id;
5513 last_offset = char_offset;
5516 *charbuf++ = c;
5517 char_offset++;
5518 continue;
5520 invalid_code:
5521 src = src_base;
5522 consumed_chars = consumed_chars_base;
5523 ONE_MORE_BYTE (c);
5524 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5525 char_offset++;
5526 coding->errors++;
5529 no_more_source:
5530 if (last_id != charset_ascii)
5531 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5532 coding->consumed_char += consumed_chars_base;
5533 coding->consumed = src_base - coding->source;
5534 coding->charbuf_used = charbuf - coding->charbuf;
5537 static bool
5538 encode_coding_charset (struct coding_system *coding)
5540 bool multibytep = coding->dst_multibyte;
5541 int *charbuf = coding->charbuf;
5542 int *charbuf_end = charbuf + coding->charbuf_used;
5543 unsigned char *dst = coding->destination + coding->produced;
5544 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5545 int safe_room = MAX_MULTIBYTE_LENGTH;
5546 ptrdiff_t produced_chars = 0;
5547 Lisp_Object attrs, charset_list;
5548 bool ascii_compatible;
5549 int c;
5551 CODING_GET_INFO (coding, attrs, charset_list);
5552 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5554 while (charbuf < charbuf_end)
5556 struct charset *charset;
5557 unsigned code;
5559 ASSURE_DESTINATION (safe_room);
5560 c = *charbuf++;
5561 if (ascii_compatible && ASCII_CHAR_P (c))
5562 EMIT_ONE_ASCII_BYTE (c);
5563 else if (CHAR_BYTE8_P (c))
5565 c = CHAR_TO_BYTE8 (c);
5566 EMIT_ONE_BYTE (c);
5568 else
5570 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5571 &code, charset);
5573 if (charset)
5575 if (CHARSET_DIMENSION (charset) == 1)
5576 EMIT_ONE_BYTE (code);
5577 else if (CHARSET_DIMENSION (charset) == 2)
5578 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5579 else if (CHARSET_DIMENSION (charset) == 3)
5580 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5581 else
5582 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5583 (code >> 8) & 0xFF, code & 0xFF);
5585 else
5587 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5588 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5589 else
5590 c = coding->default_char;
5591 EMIT_ONE_BYTE (c);
5596 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5597 coding->produced_char += produced_chars;
5598 coding->produced = dst - coding->destination;
5599 return 0;
5603 /*** 7. C library functions ***/
5605 /* Setup coding context CODING from information about CODING_SYSTEM.
5606 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5607 CODING_SYSTEM is invalid, signal an error. */
5609 void
5610 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5612 Lisp_Object attrs;
5613 Lisp_Object eol_type;
5614 Lisp_Object coding_type;
5615 Lisp_Object val;
5617 if (NILP (coding_system))
5618 coding_system = Qundecided;
5620 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5622 attrs = CODING_ID_ATTRS (coding->id);
5623 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5625 coding->mode = 0;
5626 coding->head_ascii = -1;
5627 if (VECTORP (eol_type))
5628 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5629 | CODING_REQUIRE_DETECTION_MASK);
5630 else if (! EQ (eol_type, Qunix))
5631 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5632 | CODING_REQUIRE_ENCODING_MASK);
5633 else
5634 coding->common_flags = 0;
5635 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5636 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5637 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5638 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5639 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5640 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5642 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5643 coding->max_charset_id = SCHARS (val) - 1;
5644 coding->safe_charsets = SDATA (val);
5645 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5646 coding->carryover_bytes = 0;
5648 coding_type = CODING_ATTR_TYPE (attrs);
5649 if (EQ (coding_type, Qundecided))
5651 coding->detector = NULL;
5652 coding->decoder = decode_coding_raw_text;
5653 coding->encoder = encode_coding_raw_text;
5654 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5656 else if (EQ (coding_type, Qiso_2022))
5658 int i;
5659 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5661 /* Invoke graphic register 0 to plane 0. */
5662 CODING_ISO_INVOCATION (coding, 0) = 0;
5663 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5664 CODING_ISO_INVOCATION (coding, 1)
5665 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5666 /* Setup the initial status of designation. */
5667 for (i = 0; i < 4; i++)
5668 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5669 /* Not single shifting initially. */
5670 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5671 /* Beginning of buffer should also be regarded as bol. */
5672 CODING_ISO_BOL (coding) = 1;
5673 coding->detector = detect_coding_iso_2022;
5674 coding->decoder = decode_coding_iso_2022;
5675 coding->encoder = encode_coding_iso_2022;
5676 if (flags & CODING_ISO_FLAG_SAFE)
5677 coding->mode |= CODING_MODE_SAFE_ENCODING;
5678 coding->common_flags
5679 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5680 | CODING_REQUIRE_FLUSHING_MASK);
5681 if (flags & CODING_ISO_FLAG_COMPOSITION)
5682 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5683 if (flags & CODING_ISO_FLAG_DESIGNATION)
5684 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5685 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5687 setup_iso_safe_charsets (attrs);
5688 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5689 coding->max_charset_id = SCHARS (val) - 1;
5690 coding->safe_charsets = SDATA (val);
5692 CODING_ISO_FLAGS (coding) = flags;
5693 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5694 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5695 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5696 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5698 else if (EQ (coding_type, Qcharset))
5700 coding->detector = detect_coding_charset;
5701 coding->decoder = decode_coding_charset;
5702 coding->encoder = encode_coding_charset;
5703 coding->common_flags
5704 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5706 else if (EQ (coding_type, Qutf_8))
5708 val = AREF (attrs, coding_attr_utf_bom);
5709 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5710 : EQ (val, Qt) ? utf_with_bom
5711 : utf_without_bom);
5712 coding->detector = detect_coding_utf_8;
5713 coding->decoder = decode_coding_utf_8;
5714 coding->encoder = encode_coding_utf_8;
5715 coding->common_flags
5716 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5717 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5718 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5720 else if (EQ (coding_type, Qutf_16))
5722 val = AREF (attrs, coding_attr_utf_bom);
5723 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5724 : EQ (val, Qt) ? utf_with_bom
5725 : utf_without_bom);
5726 val = AREF (attrs, coding_attr_utf_16_endian);
5727 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5728 : utf_16_little_endian);
5729 CODING_UTF_16_SURROGATE (coding) = 0;
5730 coding->detector = detect_coding_utf_16;
5731 coding->decoder = decode_coding_utf_16;
5732 coding->encoder = encode_coding_utf_16;
5733 coding->common_flags
5734 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5735 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5736 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5738 else if (EQ (coding_type, Qccl))
5740 coding->detector = detect_coding_ccl;
5741 coding->decoder = decode_coding_ccl;
5742 coding->encoder = encode_coding_ccl;
5743 coding->common_flags
5744 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5745 | CODING_REQUIRE_FLUSHING_MASK);
5747 else if (EQ (coding_type, Qemacs_mule))
5749 coding->detector = detect_coding_emacs_mule;
5750 coding->decoder = decode_coding_emacs_mule;
5751 coding->encoder = encode_coding_emacs_mule;
5752 coding->common_flags
5753 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5754 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5755 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5757 Lisp_Object tail, safe_charsets;
5758 int max_charset_id = 0;
5760 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5761 tail = XCDR (tail))
5762 if (max_charset_id < XFASTINT (XCAR (tail)))
5763 max_charset_id = XFASTINT (XCAR (tail));
5764 safe_charsets = make_uninit_string (max_charset_id + 1);
5765 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5766 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5767 tail = XCDR (tail))
5768 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5769 coding->max_charset_id = max_charset_id;
5770 coding->safe_charsets = SDATA (safe_charsets);
5772 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5773 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5775 else if (EQ (coding_type, Qshift_jis))
5777 coding->detector = detect_coding_sjis;
5778 coding->decoder = decode_coding_sjis;
5779 coding->encoder = encode_coding_sjis;
5780 coding->common_flags
5781 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5783 else if (EQ (coding_type, Qbig5))
5785 coding->detector = detect_coding_big5;
5786 coding->decoder = decode_coding_big5;
5787 coding->encoder = encode_coding_big5;
5788 coding->common_flags
5789 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5791 else /* EQ (coding_type, Qraw_text) */
5793 coding->detector = NULL;
5794 coding->decoder = decode_coding_raw_text;
5795 coding->encoder = encode_coding_raw_text;
5796 if (! EQ (eol_type, Qunix))
5798 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5799 if (! VECTORP (eol_type))
5800 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5805 return;
5808 /* Return a list of charsets supported by CODING. */
5810 Lisp_Object
5811 coding_charset_list (struct coding_system *coding)
5813 Lisp_Object attrs, charset_list;
5815 CODING_GET_INFO (coding, attrs, charset_list);
5816 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5818 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5820 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5821 charset_list = Viso_2022_charset_list;
5823 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5825 charset_list = Vemacs_mule_charset_list;
5827 return charset_list;
5831 /* Return a list of charsets supported by CODING-SYSTEM. */
5833 Lisp_Object
5834 coding_system_charset_list (Lisp_Object coding_system)
5836 ptrdiff_t id;
5837 Lisp_Object attrs, charset_list;
5839 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5840 attrs = CODING_ID_ATTRS (id);
5842 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5844 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5846 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5847 charset_list = Viso_2022_charset_list;
5848 else
5849 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5851 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5853 charset_list = Vemacs_mule_charset_list;
5855 else
5857 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5859 return charset_list;
5863 /* Return raw-text or one of its subsidiaries that has the same
5864 eol_type as CODING-SYSTEM. */
5866 Lisp_Object
5867 raw_text_coding_system (Lisp_Object coding_system)
5869 Lisp_Object spec, attrs;
5870 Lisp_Object eol_type, raw_text_eol_type;
5872 if (NILP (coding_system))
5873 return Qraw_text;
5874 spec = CODING_SYSTEM_SPEC (coding_system);
5875 attrs = AREF (spec, 0);
5877 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5878 return coding_system;
5880 eol_type = AREF (spec, 2);
5881 if (VECTORP (eol_type))
5882 return Qraw_text;
5883 spec = CODING_SYSTEM_SPEC (Qraw_text);
5884 raw_text_eol_type = AREF (spec, 2);
5885 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5886 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5887 : AREF (raw_text_eol_type, 2));
5891 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5892 the subsidiary that has the same eol-spec as PARENT (if it is not
5893 nil and specifies end-of-line format) or the system's setting
5894 (system_eol_type). */
5896 Lisp_Object
5897 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5899 Lisp_Object spec, eol_type;
5901 if (NILP (coding_system))
5902 coding_system = Qraw_text;
5903 spec = CODING_SYSTEM_SPEC (coding_system);
5904 eol_type = AREF (spec, 2);
5905 if (VECTORP (eol_type))
5907 Lisp_Object parent_eol_type;
5909 if (! NILP (parent))
5911 Lisp_Object parent_spec;
5913 parent_spec = CODING_SYSTEM_SPEC (parent);
5914 parent_eol_type = AREF (parent_spec, 2);
5915 if (VECTORP (parent_eol_type))
5916 parent_eol_type = system_eol_type;
5918 else
5919 parent_eol_type = system_eol_type;
5920 if (EQ (parent_eol_type, Qunix))
5921 coding_system = AREF (eol_type, 0);
5922 else if (EQ (parent_eol_type, Qdos))
5923 coding_system = AREF (eol_type, 1);
5924 else if (EQ (parent_eol_type, Qmac))
5925 coding_system = AREF (eol_type, 2);
5927 return coding_system;
5931 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5932 decided for writing to a process. If not, complement them, and
5933 return a new coding system. */
5935 Lisp_Object
5936 complement_process_encoding_system (Lisp_Object coding_system)
5938 Lisp_Object coding_base = Qnil, eol_base = Qnil;
5939 Lisp_Object spec, attrs;
5940 int i;
5942 for (i = 0; i < 3; i++)
5944 if (i == 1)
5945 coding_system = CDR_SAFE (Vdefault_process_coding_system);
5946 else if (i == 2)
5947 coding_system = preferred_coding_system ();
5948 spec = CODING_SYSTEM_SPEC (coding_system);
5949 if (NILP (spec))
5950 continue;
5951 attrs = AREF (spec, 0);
5952 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5953 coding_base = CODING_ATTR_BASE_NAME (attrs);
5954 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5955 eol_base = coding_system;
5956 if (! NILP (coding_base) && ! NILP (eol_base))
5957 break;
5960 if (i > 0)
5961 /* The original CODING_SYSTEM didn't specify text-conversion or
5962 eol-conversion. Be sure that we return a fully complemented
5963 coding system. */
5964 coding_system = coding_inherit_eol_type (coding_base, eol_base);
5965 return coding_system;
5969 /* Emacs has a mechanism to automatically detect a coding system if it
5970 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5971 it's impossible to distinguish some coding systems accurately
5972 because they use the same range of codes. So, at first, coding
5973 systems are categorized into 7, those are:
5975 o coding-category-emacs-mule
5977 The category for a coding system which has the same code range
5978 as Emacs' internal format. Assigned the coding-system (Lisp
5979 symbol) `emacs-mule' by default.
5981 o coding-category-sjis
5983 The category for a coding system which has the same code range
5984 as SJIS. Assigned the coding-system (Lisp
5985 symbol) `japanese-shift-jis' by default.
5987 o coding-category-iso-7
5989 The category for a coding system which has the same code range
5990 as ISO2022 of 7-bit environment. This doesn't use any locking
5991 shift and single shift functions. This can encode/decode all
5992 charsets. Assigned the coding-system (Lisp symbol)
5993 `iso-2022-7bit' by default.
5995 o coding-category-iso-7-tight
5997 Same as coding-category-iso-7 except that this can
5998 encode/decode only the specified charsets.
6000 o coding-category-iso-8-1
6002 The category for a coding system which has the same code range
6003 as ISO2022 of 8-bit environment and graphic plane 1 used only
6004 for DIMENSION1 charset. This doesn't use any locking shift
6005 and single shift functions. Assigned the coding-system (Lisp
6006 symbol) `iso-latin-1' by default.
6008 o coding-category-iso-8-2
6010 The category for a coding system which has the same code range
6011 as ISO2022 of 8-bit environment and graphic plane 1 used only
6012 for DIMENSION2 charset. This doesn't use any locking shift
6013 and single shift functions. Assigned the coding-system (Lisp
6014 symbol) `japanese-iso-8bit' by default.
6016 o coding-category-iso-7-else
6018 The category for a coding system which has the same code range
6019 as ISO2022 of 7-bit environment but uses locking shift or
6020 single shift functions. Assigned the coding-system (Lisp
6021 symbol) `iso-2022-7bit-lock' by default.
6023 o coding-category-iso-8-else
6025 The category for a coding system which has the same code range
6026 as ISO2022 of 8-bit environment but uses locking shift or
6027 single shift functions. Assigned the coding-system (Lisp
6028 symbol) `iso-2022-8bit-ss2' by default.
6030 o coding-category-big5
6032 The category for a coding system which has the same code range
6033 as BIG5. Assigned the coding-system (Lisp symbol)
6034 `cn-big5' by default.
6036 o coding-category-utf-8
6038 The category for a coding system which has the same code range
6039 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6040 symbol) `utf-8' by default.
6042 o coding-category-utf-16-be
6044 The category for a coding system in which a text has an
6045 Unicode signature (cf. Unicode Standard) in the order of BIG
6046 endian at the head. Assigned the coding-system (Lisp symbol)
6047 `utf-16-be' by default.
6049 o coding-category-utf-16-le
6051 The category for a coding system in which a text has an
6052 Unicode signature (cf. Unicode Standard) in the order of
6053 LITTLE endian at the head. Assigned the coding-system (Lisp
6054 symbol) `utf-16-le' by default.
6056 o coding-category-ccl
6058 The category for a coding system of which encoder/decoder is
6059 written in CCL programs. The default value is nil, i.e., no
6060 coding system is assigned.
6062 o coding-category-binary
6064 The category for a coding system not categorized in any of the
6065 above. Assigned the coding-system (Lisp symbol)
6066 `no-conversion' by default.
6068 Each of them is a Lisp symbol and the value is an actual
6069 `coding-system's (this is also a Lisp symbol) assigned by a user.
6070 What Emacs does actually is to detect a category of coding system.
6071 Then, it uses a `coding-system' assigned to it. If Emacs can't
6072 decide only one possible category, it selects a category of the
6073 highest priority. Priorities of categories are also specified by a
6074 user in a Lisp variable `coding-category-list'.
6078 #define EOL_SEEN_NONE 0
6079 #define EOL_SEEN_LF 1
6080 #define EOL_SEEN_CR 2
6081 #define EOL_SEEN_CRLF 4
6083 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6084 SOURCE is encoded. If CATEGORY is one of
6085 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6086 two-byte, else they are encoded by one-byte.
6088 Return one of EOL_SEEN_XXX. */
6090 #define MAX_EOL_CHECK_COUNT 3
6092 static int
6093 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6094 enum coding_category category)
6096 const unsigned char *src = source, *src_end = src + src_bytes;
6097 unsigned char c;
6098 int total = 0;
6099 int eol_seen = EOL_SEEN_NONE;
6101 if ((1 << category) & CATEGORY_MASK_UTF_16)
6103 bool msb = category == (coding_category_utf_16_le
6104 | coding_category_utf_16_le_nosig);
6105 bool lsb = !msb;
6107 while (src + 1 < src_end)
6109 c = src[lsb];
6110 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6112 int this_eol;
6114 if (c == '\n')
6115 this_eol = EOL_SEEN_LF;
6116 else if (src + 3 >= src_end
6117 || src[msb + 2] != 0
6118 || src[lsb + 2] != '\n')
6119 this_eol = EOL_SEEN_CR;
6120 else
6122 this_eol = EOL_SEEN_CRLF;
6123 src += 2;
6126 if (eol_seen == EOL_SEEN_NONE)
6127 /* This is the first end-of-line. */
6128 eol_seen = this_eol;
6129 else if (eol_seen != this_eol)
6131 /* The found type is different from what found before.
6132 Allow for stray ^M characters in DOS EOL files. */
6133 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6134 || (eol_seen == EOL_SEEN_CRLF
6135 && this_eol == EOL_SEEN_CR))
6136 eol_seen = EOL_SEEN_CRLF;
6137 else
6139 eol_seen = EOL_SEEN_LF;
6140 break;
6143 if (++total == MAX_EOL_CHECK_COUNT)
6144 break;
6146 src += 2;
6149 else
6150 while (src < src_end)
6152 c = *src++;
6153 if (c == '\n' || c == '\r')
6155 int this_eol;
6157 if (c == '\n')
6158 this_eol = EOL_SEEN_LF;
6159 else if (src >= src_end || *src != '\n')
6160 this_eol = EOL_SEEN_CR;
6161 else
6162 this_eol = EOL_SEEN_CRLF, src++;
6164 if (eol_seen == EOL_SEEN_NONE)
6165 /* This is the first end-of-line. */
6166 eol_seen = this_eol;
6167 else if (eol_seen != this_eol)
6169 /* The found type is different from what found before.
6170 Allow for stray ^M characters in DOS EOL files. */
6171 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6172 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6173 eol_seen = EOL_SEEN_CRLF;
6174 else
6176 eol_seen = EOL_SEEN_LF;
6177 break;
6180 if (++total == MAX_EOL_CHECK_COUNT)
6181 break;
6184 return eol_seen;
6188 static Lisp_Object
6189 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6191 Lisp_Object eol_type;
6193 eol_type = CODING_ID_EOL_TYPE (coding->id);
6194 if (eol_seen & EOL_SEEN_LF)
6196 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6197 eol_type = Qunix;
6199 else if (eol_seen & EOL_SEEN_CRLF)
6201 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6202 eol_type = Qdos;
6204 else if (eol_seen & EOL_SEEN_CR)
6206 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6207 eol_type = Qmac;
6209 return eol_type;
6212 /* Detect how a text specified in CODING is encoded. If a coding
6213 system is detected, update fields of CODING by the detected coding
6214 system. */
6216 static void
6217 detect_coding (struct coding_system *coding)
6219 const unsigned char *src, *src_end;
6220 unsigned int saved_mode = coding->mode;
6222 coding->consumed = coding->consumed_char = 0;
6223 coding->produced = coding->produced_char = 0;
6224 coding_set_source (coding);
6226 src_end = coding->source + coding->src_bytes;
6227 coding->head_ascii = 0;
6229 /* If we have not yet decided the text encoding type, detect it
6230 now. */
6231 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6233 int c, i;
6234 struct coding_detection_info detect_info;
6235 bool null_byte_found = 0, eight_bit_found = 0;
6237 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6238 for (src = coding->source; src < src_end; src++)
6240 c = *src;
6241 if (c & 0x80)
6243 eight_bit_found = 1;
6244 if (null_byte_found)
6245 break;
6247 else if (c < 0x20)
6249 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6250 && ! inhibit_iso_escape_detection
6251 && ! detect_info.checked)
6253 if (detect_coding_iso_2022 (coding, &detect_info))
6255 /* We have scanned the whole data. */
6256 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6258 /* We didn't find an 8-bit code. We may
6259 have found a null-byte, but it's very
6260 rare that a binary file conforms to
6261 ISO-2022. */
6262 src = src_end;
6263 coding->head_ascii = src - coding->source;
6265 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6266 break;
6269 else if (! c && !inhibit_null_byte_detection)
6271 null_byte_found = 1;
6272 if (eight_bit_found)
6273 break;
6275 if (! eight_bit_found)
6276 coding->head_ascii++;
6278 else if (! eight_bit_found)
6279 coding->head_ascii++;
6282 if (null_byte_found || eight_bit_found
6283 || coding->head_ascii < coding->src_bytes
6284 || detect_info.found)
6286 enum coding_category category;
6287 struct coding_system *this;
6289 if (coding->head_ascii == coding->src_bytes)
6290 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6291 for (i = 0; i < coding_category_raw_text; i++)
6293 category = coding_priorities[i];
6294 this = coding_categories + category;
6295 if (detect_info.found & (1 << category))
6296 break;
6298 else
6300 if (null_byte_found)
6302 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6303 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6305 for (i = 0; i < coding_category_raw_text; i++)
6307 category = coding_priorities[i];
6308 this = coding_categories + category;
6309 /* Some of this->detector (e.g. detect_coding_sjis)
6310 require this information. */
6311 coding->id = this->id;
6312 if (this->id < 0)
6314 /* No coding system of this category is defined. */
6315 detect_info.rejected |= (1 << category);
6317 else if (category >= coding_category_raw_text)
6318 continue;
6319 else if (detect_info.checked & (1 << category))
6321 if (detect_info.found & (1 << category))
6322 break;
6324 else if ((*(this->detector)) (coding, &detect_info)
6325 && detect_info.found & (1 << category))
6327 if (category == coding_category_utf_16_auto)
6329 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6330 category = coding_category_utf_16_le;
6331 else
6332 category = coding_category_utf_16_be;
6334 break;
6339 if (i < coding_category_raw_text)
6340 setup_coding_system (CODING_ID_NAME (this->id), coding);
6341 else if (null_byte_found)
6342 setup_coding_system (Qno_conversion, coding);
6343 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6344 == CATEGORY_MASK_ANY)
6345 setup_coding_system (Qraw_text, coding);
6346 else if (detect_info.rejected)
6347 for (i = 0; i < coding_category_raw_text; i++)
6348 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6350 this = coding_categories + coding_priorities[i];
6351 setup_coding_system (CODING_ID_NAME (this->id), coding);
6352 break;
6356 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6357 == coding_category_utf_8_auto)
6359 Lisp_Object coding_systems;
6360 struct coding_detection_info detect_info;
6362 coding_systems
6363 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6364 detect_info.found = detect_info.rejected = 0;
6365 coding->head_ascii = 0;
6366 if (CONSP (coding_systems)
6367 && detect_coding_utf_8 (coding, &detect_info))
6369 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6370 setup_coding_system (XCAR (coding_systems), coding);
6371 else
6372 setup_coding_system (XCDR (coding_systems), coding);
6375 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6376 == coding_category_utf_16_auto)
6378 Lisp_Object coding_systems;
6379 struct coding_detection_info detect_info;
6381 coding_systems
6382 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6383 detect_info.found = detect_info.rejected = 0;
6384 coding->head_ascii = 0;
6385 if (CONSP (coding_systems)
6386 && detect_coding_utf_16 (coding, &detect_info))
6388 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6389 setup_coding_system (XCAR (coding_systems), coding);
6390 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6391 setup_coding_system (XCDR (coding_systems), coding);
6394 coding->mode = saved_mode;
6398 static void
6399 decode_eol (struct coding_system *coding)
6401 Lisp_Object eol_type;
6402 unsigned char *p, *pbeg, *pend;
6404 eol_type = CODING_ID_EOL_TYPE (coding->id);
6405 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6406 return;
6408 if (NILP (coding->dst_object))
6409 pbeg = coding->destination;
6410 else
6411 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6412 pend = pbeg + coding->produced;
6414 if (VECTORP (eol_type))
6416 int eol_seen = EOL_SEEN_NONE;
6418 for (p = pbeg; p < pend; p++)
6420 if (*p == '\n')
6421 eol_seen |= EOL_SEEN_LF;
6422 else if (*p == '\r')
6424 if (p + 1 < pend && *(p + 1) == '\n')
6426 eol_seen |= EOL_SEEN_CRLF;
6427 p++;
6429 else
6430 eol_seen |= EOL_SEEN_CR;
6433 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6434 if ((eol_seen & EOL_SEEN_CRLF) != 0
6435 && (eol_seen & EOL_SEEN_CR) != 0
6436 && (eol_seen & EOL_SEEN_LF) == 0)
6437 eol_seen = EOL_SEEN_CRLF;
6438 else if (eol_seen != EOL_SEEN_NONE
6439 && eol_seen != EOL_SEEN_LF
6440 && eol_seen != EOL_SEEN_CRLF
6441 && eol_seen != EOL_SEEN_CR)
6442 eol_seen = EOL_SEEN_LF;
6443 if (eol_seen != EOL_SEEN_NONE)
6444 eol_type = adjust_coding_eol_type (coding, eol_seen);
6447 if (EQ (eol_type, Qmac))
6449 for (p = pbeg; p < pend; p++)
6450 if (*p == '\r')
6451 *p = '\n';
6453 else if (EQ (eol_type, Qdos))
6455 ptrdiff_t n = 0;
6457 if (NILP (coding->dst_object))
6459 /* Start deleting '\r' from the tail to minimize the memory
6460 movement. */
6461 for (p = pend - 2; p >= pbeg; p--)
6462 if (*p == '\r')
6464 memmove (p, p + 1, pend-- - p - 1);
6465 n++;
6468 else
6470 ptrdiff_t pos_byte = coding->dst_pos_byte;
6471 ptrdiff_t pos = coding->dst_pos;
6472 ptrdiff_t pos_end = pos + coding->produced_char - 1;
6474 while (pos < pos_end)
6476 p = BYTE_POS_ADDR (pos_byte);
6477 if (*p == '\r' && p[1] == '\n')
6479 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6480 n++;
6481 pos_end--;
6483 pos++;
6484 if (coding->dst_multibyte)
6485 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6486 else
6487 pos_byte++;
6490 coding->produced -= n;
6491 coding->produced_char -= n;
6496 /* Return a translation table (or list of them) from coding system
6497 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6498 not ENCODEP). */
6500 static Lisp_Object
6501 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6503 Lisp_Object standard, translation_table;
6504 Lisp_Object val;
6506 if (NILP (Venable_character_translation))
6508 if (max_lookup)
6509 *max_lookup = 0;
6510 return Qnil;
6512 if (encodep)
6513 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6514 standard = Vstandard_translation_table_for_encode;
6515 else
6516 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6517 standard = Vstandard_translation_table_for_decode;
6518 if (NILP (translation_table))
6519 translation_table = standard;
6520 else
6522 if (SYMBOLP (translation_table))
6523 translation_table = Fget (translation_table, Qtranslation_table);
6524 else if (CONSP (translation_table))
6526 translation_table = Fcopy_sequence (translation_table);
6527 for (val = translation_table; CONSP (val); val = XCDR (val))
6528 if (SYMBOLP (XCAR (val)))
6529 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6531 if (CHAR_TABLE_P (standard))
6533 if (CONSP (translation_table))
6534 translation_table = nconc2 (translation_table,
6535 Fcons (standard, Qnil));
6536 else
6537 translation_table = Fcons (translation_table,
6538 Fcons (standard, Qnil));
6542 if (max_lookup)
6544 *max_lookup = 1;
6545 if (CHAR_TABLE_P (translation_table)
6546 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6548 val = XCHAR_TABLE (translation_table)->extras[1];
6549 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6550 *max_lookup = XFASTINT (val);
6552 else if (CONSP (translation_table))
6554 Lisp_Object tail;
6556 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6557 if (CHAR_TABLE_P (XCAR (tail))
6558 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6560 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6561 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6562 *max_lookup = XFASTINT (tailval);
6566 return translation_table;
6569 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6570 do { \
6571 trans = Qnil; \
6572 if (CHAR_TABLE_P (table)) \
6574 trans = CHAR_TABLE_REF (table, c); \
6575 if (CHARACTERP (trans)) \
6576 c = XFASTINT (trans), trans = Qnil; \
6578 else if (CONSP (table)) \
6580 Lisp_Object tail; \
6582 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6583 if (CHAR_TABLE_P (XCAR (tail))) \
6585 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6586 if (CHARACTERP (trans)) \
6587 c = XFASTINT (trans), trans = Qnil; \
6588 else if (! NILP (trans)) \
6589 break; \
6592 } while (0)
6595 /* Return a translation of character(s) at BUF according to TRANS.
6596 TRANS is TO-CHAR or ((FROM . TO) ...) where
6597 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6598 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6599 translation is found, and Qnil if not found..
6600 If BUF is too short to lookup characters in FROM, return Qt. */
6602 static Lisp_Object
6603 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6606 if (INTEGERP (trans))
6607 return trans;
6608 for (; CONSP (trans); trans = XCDR (trans))
6610 Lisp_Object val = XCAR (trans);
6611 Lisp_Object from = XCAR (val);
6612 ptrdiff_t len = ASIZE (from);
6613 ptrdiff_t i;
6615 for (i = 0; i < len; i++)
6617 if (buf + i == buf_end)
6618 return Qt;
6619 if (XINT (AREF (from, i)) != buf[i])
6620 break;
6622 if (i == len)
6623 return val;
6625 return Qnil;
6629 static int
6630 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6631 bool last_block)
6633 unsigned char *dst = coding->destination + coding->produced;
6634 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6635 ptrdiff_t produced;
6636 ptrdiff_t produced_chars = 0;
6637 int carryover = 0;
6639 if (! coding->chars_at_source)
6641 /* Source characters are in coding->charbuf. */
6642 int *buf = coding->charbuf;
6643 int *buf_end = buf + coding->charbuf_used;
6645 if (EQ (coding->src_object, coding->dst_object))
6647 coding_set_source (coding);
6648 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6651 while (buf < buf_end)
6653 int c = *buf;
6654 ptrdiff_t i;
6656 if (c >= 0)
6658 ptrdiff_t from_nchars = 1, to_nchars = 1;
6659 Lisp_Object trans = Qnil;
6661 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6662 if (! NILP (trans))
6664 trans = get_translation (trans, buf, buf_end);
6665 if (INTEGERP (trans))
6666 c = XINT (trans);
6667 else if (CONSP (trans))
6669 from_nchars = ASIZE (XCAR (trans));
6670 trans = XCDR (trans);
6671 if (INTEGERP (trans))
6672 c = XINT (trans);
6673 else
6675 to_nchars = ASIZE (trans);
6676 c = XINT (AREF (trans, 0));
6679 else if (EQ (trans, Qt) && ! last_block)
6680 break;
6683 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6685 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6686 / MAX_MULTIBYTE_LENGTH)
6687 < to_nchars)
6688 memory_full (SIZE_MAX);
6689 dst = alloc_destination (coding,
6690 buf_end - buf
6691 + MAX_MULTIBYTE_LENGTH * to_nchars,
6692 dst);
6693 if (EQ (coding->src_object, coding->dst_object))
6695 coding_set_source (coding);
6696 dst_end = (((unsigned char *) coding->source)
6697 + coding->consumed);
6699 else
6700 dst_end = coding->destination + coding->dst_bytes;
6703 for (i = 0; i < to_nchars; i++)
6705 if (i > 0)
6706 c = XINT (AREF (trans, i));
6707 if (coding->dst_multibyte
6708 || ! CHAR_BYTE8_P (c))
6709 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6710 else
6711 *dst++ = CHAR_TO_BYTE8 (c);
6713 produced_chars += to_nchars;
6714 buf += from_nchars;
6716 else
6717 /* This is an annotation datum. (-C) is the length. */
6718 buf += -c;
6720 carryover = buf_end - buf;
6722 else
6724 /* Source characters are at coding->source. */
6725 const unsigned char *src = coding->source;
6726 const unsigned char *src_end = src + coding->consumed;
6728 if (EQ (coding->dst_object, coding->src_object))
6729 dst_end = (unsigned char *) src;
6730 if (coding->src_multibyte != coding->dst_multibyte)
6732 if (coding->src_multibyte)
6734 bool multibytep = 1;
6735 ptrdiff_t consumed_chars = 0;
6737 while (1)
6739 const unsigned char *src_base = src;
6740 int c;
6742 ONE_MORE_BYTE (c);
6743 if (dst == dst_end)
6745 if (EQ (coding->src_object, coding->dst_object))
6746 dst_end = (unsigned char *) src;
6747 if (dst == dst_end)
6749 ptrdiff_t offset = src - coding->source;
6751 dst = alloc_destination (coding, src_end - src + 1,
6752 dst);
6753 dst_end = coding->destination + coding->dst_bytes;
6754 coding_set_source (coding);
6755 src = coding->source + offset;
6756 src_end = coding->source + coding->consumed;
6757 if (EQ (coding->src_object, coding->dst_object))
6758 dst_end = (unsigned char *) src;
6761 *dst++ = c;
6762 produced_chars++;
6764 no_more_source:
6767 else
6768 while (src < src_end)
6770 bool multibytep = 1;
6771 int c = *src++;
6773 if (dst >= dst_end - 1)
6775 if (EQ (coding->src_object, coding->dst_object))
6776 dst_end = (unsigned char *) src;
6777 if (dst >= dst_end - 1)
6779 ptrdiff_t offset = src - coding->source;
6780 ptrdiff_t more_bytes;
6782 if (EQ (coding->src_object, coding->dst_object))
6783 more_bytes = ((src_end - src) / 2) + 2;
6784 else
6785 more_bytes = src_end - src + 2;
6786 dst = alloc_destination (coding, more_bytes, dst);
6787 dst_end = coding->destination + coding->dst_bytes;
6788 coding_set_source (coding);
6789 src = coding->source + offset;
6790 src_end = coding->source + coding->consumed;
6791 if (EQ (coding->src_object, coding->dst_object))
6792 dst_end = (unsigned char *) src;
6795 EMIT_ONE_BYTE (c);
6798 else
6800 if (!EQ (coding->src_object, coding->dst_object))
6802 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6804 if (require > 0)
6806 ptrdiff_t offset = src - coding->source;
6808 dst = alloc_destination (coding, require, dst);
6809 coding_set_source (coding);
6810 src = coding->source + offset;
6811 src_end = coding->source + coding->consumed;
6814 produced_chars = coding->consumed_char;
6815 while (src < src_end)
6816 *dst++ = *src++;
6820 produced = dst - (coding->destination + coding->produced);
6821 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6822 insert_from_gap (produced_chars, produced);
6823 coding->produced += produced;
6824 coding->produced_char += produced_chars;
6825 return carryover;
6828 /* Compose text in CODING->object according to the annotation data at
6829 CHARBUF. CHARBUF is an array:
6830 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6833 static void
6834 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6836 int len;
6837 ptrdiff_t to;
6838 enum composition_method method;
6839 Lisp_Object components;
6841 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6842 to = pos + charbuf[2];
6843 method = (enum composition_method) (charbuf[4]);
6845 if (method == COMPOSITION_RELATIVE)
6846 components = Qnil;
6847 else
6849 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6850 int i, j;
6852 if (method == COMPOSITION_WITH_RULE)
6853 len = charbuf[2] * 3 - 2;
6854 charbuf += MAX_ANNOTATION_LENGTH;
6855 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6856 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6858 if (charbuf[i] >= 0)
6859 args[j] = make_number (charbuf[i]);
6860 else
6862 i++;
6863 args[j] = make_number (charbuf[i] % 0x100);
6866 components = (i == j ? Fstring (j, args) : Fvector (j, args));
6868 compose_text (pos, to, components, Qnil, coding->dst_object);
6872 /* Put `charset' property on text in CODING->object according to
6873 the annotation data at CHARBUF. CHARBUF is an array:
6874 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6877 static void
6878 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6880 ptrdiff_t from = pos - charbuf[2];
6881 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6883 Fput_text_property (make_number (from), make_number (pos),
6884 Qcharset, CHARSET_NAME (charset),
6885 coding->dst_object);
6889 #define CHARBUF_SIZE 0x4000
6891 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6892 do { \
6893 int size = CHARBUF_SIZE; \
6895 coding->charbuf = NULL; \
6896 while (size > 1024) \
6898 coding->charbuf = alloca (sizeof (int) * size); \
6899 if (coding->charbuf) \
6900 break; \
6901 size >>= 1; \
6903 if (! coding->charbuf) \
6905 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6906 return; \
6908 coding->charbuf_size = size; \
6909 } while (0)
6912 static void
6913 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6915 int *charbuf = coding->charbuf;
6916 int *charbuf_end = charbuf + coding->charbuf_used;
6918 if (NILP (coding->dst_object))
6919 return;
6921 while (charbuf < charbuf_end)
6923 if (*charbuf >= 0)
6924 pos++, charbuf++;
6925 else
6927 int len = -*charbuf;
6929 if (len > 2)
6930 switch (charbuf[1])
6932 case CODING_ANNOTATE_COMPOSITION_MASK:
6933 produce_composition (coding, charbuf, pos);
6934 break;
6935 case CODING_ANNOTATE_CHARSET_MASK:
6936 produce_charset (coding, charbuf, pos);
6937 break;
6939 charbuf += len;
6944 /* Decode the data at CODING->src_object into CODING->dst_object.
6945 CODING->src_object is a buffer, a string, or nil.
6946 CODING->dst_object is a buffer.
6948 If CODING->src_object is a buffer, it must be the current buffer.
6949 In this case, if CODING->src_pos is positive, it is a position of
6950 the source text in the buffer, otherwise, the source text is in the
6951 gap area of the buffer, and CODING->src_pos specifies the offset of
6952 the text from GPT (which must be the same as PT). If this is the
6953 same buffer as CODING->dst_object, CODING->src_pos must be
6954 negative.
6956 If CODING->src_object is a string, CODING->src_pos is an index to
6957 that string.
6959 If CODING->src_object is nil, CODING->source must already point to
6960 the non-relocatable memory area. In this case, CODING->src_pos is
6961 an offset from CODING->source.
6963 The decoded data is inserted at the current point of the buffer
6964 CODING->dst_object.
6967 static void
6968 decode_coding (struct coding_system *coding)
6970 Lisp_Object attrs;
6971 Lisp_Object undo_list;
6972 Lisp_Object translation_table;
6973 struct ccl_spec cclspec;
6974 int carryover;
6975 int i;
6977 if (BUFFERP (coding->src_object)
6978 && coding->src_pos > 0
6979 && coding->src_pos < GPT
6980 && coding->src_pos + coding->src_chars > GPT)
6981 move_gap_both (coding->src_pos, coding->src_pos_byte);
6983 undo_list = Qt;
6984 if (BUFFERP (coding->dst_object))
6986 set_buffer_internal (XBUFFER (coding->dst_object));
6987 if (GPT != PT)
6988 move_gap_both (PT, PT_BYTE);
6990 /* We must disable undo_list in order to record the whole insert
6991 transaction via record_insert at the end. But doing so also
6992 disables the recording of the first change to the undo_list.
6993 Therefore we check for first change here and record it via
6994 record_first_change if needed. */
6995 if (MODIFF <= SAVE_MODIFF)
6996 record_first_change ();
6998 undo_list = BVAR (current_buffer, undo_list);
6999 bset_undo_list (current_buffer, Qt);
7002 coding->consumed = coding->consumed_char = 0;
7003 coding->produced = coding->produced_char = 0;
7004 coding->chars_at_source = 0;
7005 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7006 coding->errors = 0;
7008 ALLOC_CONVERSION_WORK_AREA (coding);
7010 attrs = CODING_ID_ATTRS (coding->id);
7011 translation_table = get_translation_table (attrs, 0, NULL);
7013 carryover = 0;
7014 if (coding->decoder == decode_coding_ccl)
7016 coding->spec.ccl = &cclspec;
7017 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7021 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7023 coding_set_source (coding);
7024 coding->annotated = 0;
7025 coding->charbuf_used = carryover;
7026 (*(coding->decoder)) (coding);
7027 coding_set_destination (coding);
7028 carryover = produce_chars (coding, translation_table, 0);
7029 if (coding->annotated)
7030 produce_annotation (coding, pos);
7031 for (i = 0; i < carryover; i++)
7032 coding->charbuf[i]
7033 = coding->charbuf[coding->charbuf_used - carryover + i];
7035 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7036 || (coding->consumed < coding->src_bytes
7037 && (coding->result == CODING_RESULT_SUCCESS
7038 || coding->result == CODING_RESULT_INVALID_SRC)));
7040 if (carryover > 0)
7042 coding_set_destination (coding);
7043 coding->charbuf_used = carryover;
7044 produce_chars (coding, translation_table, 1);
7047 coding->carryover_bytes = 0;
7048 if (coding->consumed < coding->src_bytes)
7050 int nbytes = coding->src_bytes - coding->consumed;
7051 const unsigned char *src;
7053 coding_set_source (coding);
7054 coding_set_destination (coding);
7055 src = coding->source + coding->consumed;
7057 if (coding->mode & CODING_MODE_LAST_BLOCK)
7059 /* Flush out unprocessed data as binary chars. We are sure
7060 that the number of data is less than the size of
7061 coding->charbuf. */
7062 coding->charbuf_used = 0;
7063 coding->chars_at_source = 0;
7065 while (nbytes-- > 0)
7067 int c = *src++;
7069 if (c & 0x80)
7070 c = BYTE8_TO_CHAR (c);
7071 coding->charbuf[coding->charbuf_used++] = c;
7073 produce_chars (coding, Qnil, 1);
7075 else
7077 /* Record unprocessed bytes in coding->carryover. We are
7078 sure that the number of data is less than the size of
7079 coding->carryover. */
7080 unsigned char *p = coding->carryover;
7082 if (nbytes > sizeof coding->carryover)
7083 nbytes = sizeof coding->carryover;
7084 coding->carryover_bytes = nbytes;
7085 while (nbytes-- > 0)
7086 *p++ = *src++;
7088 coding->consumed = coding->src_bytes;
7091 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7092 && !inhibit_eol_conversion)
7093 decode_eol (coding);
7094 if (BUFFERP (coding->dst_object))
7096 bset_undo_list (current_buffer, undo_list);
7097 record_insert (coding->dst_pos, coding->produced_char);
7102 /* Extract an annotation datum from a composition starting at POS and
7103 ending before LIMIT of CODING->src_object (buffer or string), store
7104 the data in BUF, set *STOP to a starting position of the next
7105 composition (if any) or to LIMIT, and return the address of the
7106 next element of BUF.
7108 If such an annotation is not found, set *STOP to a starting
7109 position of a composition after POS (if any) or to LIMIT, and
7110 return BUF. */
7112 static int *
7113 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7114 struct coding_system *coding, int *buf,
7115 ptrdiff_t *stop)
7117 ptrdiff_t start, end;
7118 Lisp_Object prop;
7120 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7121 || end > limit)
7122 *stop = limit;
7123 else if (start > pos)
7124 *stop = start;
7125 else
7127 if (start == pos)
7129 /* We found a composition. Store the corresponding
7130 annotation data in BUF. */
7131 int *head = buf;
7132 enum composition_method method = COMPOSITION_METHOD (prop);
7133 int nchars = COMPOSITION_LENGTH (prop);
7135 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7136 if (method != COMPOSITION_RELATIVE)
7138 Lisp_Object components;
7139 ptrdiff_t i, len, i_byte;
7141 components = COMPOSITION_COMPONENTS (prop);
7142 if (VECTORP (components))
7144 len = ASIZE (components);
7145 for (i = 0; i < len; i++)
7146 *buf++ = XINT (AREF (components, i));
7148 else if (STRINGP (components))
7150 len = SCHARS (components);
7151 i = i_byte = 0;
7152 while (i < len)
7154 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7155 buf++;
7158 else if (INTEGERP (components))
7160 len = 1;
7161 *buf++ = XINT (components);
7163 else if (CONSP (components))
7165 for (len = 0; CONSP (components);
7166 len++, components = XCDR (components))
7167 *buf++ = XINT (XCAR (components));
7169 else
7170 emacs_abort ();
7171 *head -= len;
7175 if (find_composition (end, limit, &start, &end, &prop,
7176 coding->src_object)
7177 && end <= limit)
7178 *stop = start;
7179 else
7180 *stop = limit;
7182 return buf;
7186 /* Extract an annotation datum from a text property `charset' at POS of
7187 CODING->src_object (buffer of string), store the data in BUF, set
7188 *STOP to the position where the value of `charset' property changes
7189 (limiting by LIMIT), and return the address of the next element of
7190 BUF.
7192 If the property value is nil, set *STOP to the position where the
7193 property value is non-nil (limiting by LIMIT), and return BUF. */
7195 static int *
7196 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7197 struct coding_system *coding, int *buf,
7198 ptrdiff_t *stop)
7200 Lisp_Object val, next;
7201 int id;
7203 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7204 if (! NILP (val) && CHARSETP (val))
7205 id = XINT (CHARSET_SYMBOL_ID (val));
7206 else
7207 id = -1;
7208 ADD_CHARSET_DATA (buf, 0, id);
7209 next = Fnext_single_property_change (make_number (pos), Qcharset,
7210 coding->src_object,
7211 make_number (limit));
7212 *stop = XINT (next);
7213 return buf;
7217 static void
7218 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7219 int max_lookup)
7221 int *buf = coding->charbuf;
7222 int *buf_end = coding->charbuf + coding->charbuf_size;
7223 const unsigned char *src = coding->source + coding->consumed;
7224 const unsigned char *src_end = coding->source + coding->src_bytes;
7225 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7226 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7227 bool multibytep = coding->src_multibyte;
7228 Lisp_Object eol_type;
7229 int c;
7230 ptrdiff_t stop, stop_composition, stop_charset;
7231 int *lookup_buf = NULL;
7233 if (! NILP (translation_table))
7234 lookup_buf = alloca (sizeof (int) * max_lookup);
7236 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7237 if (VECTORP (eol_type))
7238 eol_type = Qunix;
7240 /* Note: composition handling is not yet implemented. */
7241 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7243 if (NILP (coding->src_object))
7244 stop = stop_composition = stop_charset = end_pos;
7245 else
7247 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7248 stop = stop_composition = pos;
7249 else
7250 stop = stop_composition = end_pos;
7251 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7252 stop = stop_charset = pos;
7253 else
7254 stop_charset = end_pos;
7257 /* Compensate for CRLF and conversion. */
7258 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7259 while (buf < buf_end)
7261 Lisp_Object trans;
7263 if (pos == stop)
7265 if (pos == end_pos)
7266 break;
7267 if (pos == stop_composition)
7268 buf = handle_composition_annotation (pos, end_pos, coding,
7269 buf, &stop_composition);
7270 if (pos == stop_charset)
7271 buf = handle_charset_annotation (pos, end_pos, coding,
7272 buf, &stop_charset);
7273 stop = (stop_composition < stop_charset
7274 ? stop_composition : stop_charset);
7277 if (! multibytep)
7279 int bytes;
7281 if (coding->encoder == encode_coding_raw_text
7282 || coding->encoder == encode_coding_ccl)
7283 c = *src++, pos++;
7284 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7285 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7286 else
7287 c = BYTE8_TO_CHAR (*src), src++, pos++;
7289 else
7290 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7291 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7292 c = '\n';
7293 if (! EQ (eol_type, Qunix))
7295 if (c == '\n')
7297 if (EQ (eol_type, Qdos))
7298 *buf++ = '\r';
7299 else
7300 c = '\r';
7304 trans = Qnil;
7305 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7306 if (NILP (trans))
7307 *buf++ = c;
7308 else
7310 ptrdiff_t from_nchars = 1, to_nchars = 1;
7311 int *lookup_buf_end;
7312 const unsigned char *p = src;
7313 int i;
7315 lookup_buf[0] = c;
7316 for (i = 1; i < max_lookup && p < src_end; i++)
7317 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7318 lookup_buf_end = lookup_buf + i;
7319 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7320 if (INTEGERP (trans))
7321 c = XINT (trans);
7322 else if (CONSP (trans))
7324 from_nchars = ASIZE (XCAR (trans));
7325 trans = XCDR (trans);
7326 if (INTEGERP (trans))
7327 c = XINT (trans);
7328 else
7330 to_nchars = ASIZE (trans);
7331 if (buf_end - buf < to_nchars)
7332 break;
7333 c = XINT (AREF (trans, 0));
7336 else
7337 break;
7338 *buf++ = c;
7339 for (i = 1; i < to_nchars; i++)
7340 *buf++ = XINT (AREF (trans, i));
7341 for (i = 1; i < from_nchars; i++, pos++)
7342 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7346 coding->consumed = src - coding->source;
7347 coding->consumed_char = pos - coding->src_pos;
7348 coding->charbuf_used = buf - coding->charbuf;
7349 coding->chars_at_source = 0;
7353 /* Encode the text at CODING->src_object into CODING->dst_object.
7354 CODING->src_object is a buffer or a string.
7355 CODING->dst_object is a buffer or nil.
7357 If CODING->src_object is a buffer, it must be the current buffer.
7358 In this case, if CODING->src_pos is positive, it is a position of
7359 the source text in the buffer, otherwise. the source text is in the
7360 gap area of the buffer, and coding->src_pos specifies the offset of
7361 the text from GPT (which must be the same as PT). If this is the
7362 same buffer as CODING->dst_object, CODING->src_pos must be
7363 negative and CODING should not have `pre-write-conversion'.
7365 If CODING->src_object is a string, CODING should not have
7366 `pre-write-conversion'.
7368 If CODING->dst_object is a buffer, the encoded data is inserted at
7369 the current point of that buffer.
7371 If CODING->dst_object is nil, the encoded data is placed at the
7372 memory area specified by CODING->destination. */
7374 static void
7375 encode_coding (struct coding_system *coding)
7377 Lisp_Object attrs;
7378 Lisp_Object translation_table;
7379 int max_lookup;
7380 struct ccl_spec cclspec;
7382 attrs = CODING_ID_ATTRS (coding->id);
7383 if (coding->encoder == encode_coding_raw_text)
7384 translation_table = Qnil, max_lookup = 0;
7385 else
7386 translation_table = get_translation_table (attrs, 1, &max_lookup);
7388 if (BUFFERP (coding->dst_object))
7390 set_buffer_internal (XBUFFER (coding->dst_object));
7391 coding->dst_multibyte
7392 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7395 coding->consumed = coding->consumed_char = 0;
7396 coding->produced = coding->produced_char = 0;
7397 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7398 coding->errors = 0;
7400 ALLOC_CONVERSION_WORK_AREA (coding);
7402 if (coding->encoder == encode_coding_ccl)
7404 coding->spec.ccl = &cclspec;
7405 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7407 do {
7408 coding_set_source (coding);
7409 consume_chars (coding, translation_table, max_lookup);
7410 coding_set_destination (coding);
7411 (*(coding->encoder)) (coding);
7412 } while (coding->consumed_char < coding->src_chars);
7414 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7415 insert_from_gap (coding->produced_char, coding->produced);
7419 /* Name (or base name) of work buffer for code conversion. */
7420 static Lisp_Object Vcode_conversion_workbuf_name;
7422 /* A working buffer used by the top level conversion. Once it is
7423 created, it is never destroyed. It has the name
7424 Vcode_conversion_workbuf_name. The other working buffers are
7425 destroyed after the use is finished, and their names are modified
7426 versions of Vcode_conversion_workbuf_name. */
7427 static Lisp_Object Vcode_conversion_reused_workbuf;
7429 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7430 static bool reused_workbuf_in_use;
7433 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7434 multibyteness of returning buffer. */
7436 static Lisp_Object
7437 make_conversion_work_buffer (bool multibyte)
7439 Lisp_Object name, workbuf;
7440 struct buffer *current;
7442 if (reused_workbuf_in_use)
7444 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7445 workbuf = Fget_buffer_create (name);
7447 else
7449 reused_workbuf_in_use = 1;
7450 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7451 Vcode_conversion_reused_workbuf
7452 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7453 workbuf = Vcode_conversion_reused_workbuf;
7455 current = current_buffer;
7456 set_buffer_internal (XBUFFER (workbuf));
7457 /* We can't allow modification hooks to run in the work buffer. For
7458 instance, directory_files_internal assumes that file decoding
7459 doesn't compile new regexps. */
7460 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7461 Ferase_buffer ();
7462 bset_undo_list (current_buffer, Qt);
7463 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7464 set_buffer_internal (current);
7465 return workbuf;
7469 static Lisp_Object
7470 code_conversion_restore (Lisp_Object arg)
7472 Lisp_Object current, workbuf;
7473 struct gcpro gcpro1;
7475 GCPRO1 (arg);
7476 current = XCAR (arg);
7477 workbuf = XCDR (arg);
7478 if (! NILP (workbuf))
7480 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7481 reused_workbuf_in_use = 0;
7482 else
7483 Fkill_buffer (workbuf);
7485 set_buffer_internal (XBUFFER (current));
7486 UNGCPRO;
7487 return Qnil;
7490 Lisp_Object
7491 code_conversion_save (bool with_work_buf, bool multibyte)
7493 Lisp_Object workbuf = Qnil;
7495 if (with_work_buf)
7496 workbuf = make_conversion_work_buffer (multibyte);
7497 record_unwind_protect (code_conversion_restore,
7498 Fcons (Fcurrent_buffer (), workbuf));
7499 return workbuf;
7502 void
7503 decode_coding_gap (struct coding_system *coding,
7504 ptrdiff_t chars, ptrdiff_t bytes)
7506 ptrdiff_t count = SPECPDL_INDEX ();
7507 Lisp_Object attrs;
7509 code_conversion_save (0, 0);
7511 coding->src_object = Fcurrent_buffer ();
7512 coding->src_chars = chars;
7513 coding->src_bytes = bytes;
7514 coding->src_pos = -chars;
7515 coding->src_pos_byte = -bytes;
7516 coding->src_multibyte = chars < bytes;
7517 coding->dst_object = coding->src_object;
7518 coding->dst_pos = PT;
7519 coding->dst_pos_byte = PT_BYTE;
7520 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7522 if (CODING_REQUIRE_DETECTION (coding))
7523 detect_coding (coding);
7525 coding->mode |= CODING_MODE_LAST_BLOCK;
7526 current_buffer->text->inhibit_shrinking = 1;
7527 decode_coding (coding);
7528 current_buffer->text->inhibit_shrinking = 0;
7530 attrs = CODING_ID_ATTRS (coding->id);
7531 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7533 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7534 Lisp_Object val;
7536 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7537 val = call1 (CODING_ATTR_POST_READ (attrs),
7538 make_number (coding->produced_char));
7539 CHECK_NATNUM (val);
7540 coding->produced_char += Z - prev_Z;
7541 coding->produced += Z_BYTE - prev_Z_BYTE;
7544 unbind_to (count, Qnil);
7548 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7549 SRC_OBJECT into DST_OBJECT by coding context CODING.
7551 SRC_OBJECT is a buffer, a string, or Qnil.
7553 If it is a buffer, the text is at point of the buffer. FROM and TO
7554 are positions in the buffer.
7556 If it is a string, the text is at the beginning of the string.
7557 FROM and TO are indices to the string.
7559 If it is nil, the text is at coding->source. FROM and TO are
7560 indices to coding->source.
7562 DST_OBJECT is a buffer, Qt, or Qnil.
7564 If it is a buffer, the decoded text is inserted at point of the
7565 buffer. If the buffer is the same as SRC_OBJECT, the source text
7566 is deleted.
7568 If it is Qt, a string is made from the decoded text, and
7569 set in CODING->dst_object.
7571 If it is Qnil, the decoded text is stored at CODING->destination.
7572 The caller must allocate CODING->dst_bytes bytes at
7573 CODING->destination by xmalloc. If the decoded text is longer than
7574 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7577 void
7578 decode_coding_object (struct coding_system *coding,
7579 Lisp_Object src_object,
7580 ptrdiff_t from, ptrdiff_t from_byte,
7581 ptrdiff_t to, ptrdiff_t to_byte,
7582 Lisp_Object dst_object)
7584 ptrdiff_t count = SPECPDL_INDEX ();
7585 unsigned char *destination IF_LINT (= NULL);
7586 ptrdiff_t dst_bytes IF_LINT (= 0);
7587 ptrdiff_t chars = to - from;
7588 ptrdiff_t bytes = to_byte - from_byte;
7589 Lisp_Object attrs;
7590 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7591 bool need_marker_adjustment = 0;
7592 Lisp_Object old_deactivate_mark;
7594 old_deactivate_mark = Vdeactivate_mark;
7596 if (NILP (dst_object))
7598 destination = coding->destination;
7599 dst_bytes = coding->dst_bytes;
7602 coding->src_object = src_object;
7603 coding->src_chars = chars;
7604 coding->src_bytes = bytes;
7605 coding->src_multibyte = chars < bytes;
7607 if (STRINGP (src_object))
7609 coding->src_pos = from;
7610 coding->src_pos_byte = from_byte;
7612 else if (BUFFERP (src_object))
7614 set_buffer_internal (XBUFFER (src_object));
7615 if (from != GPT)
7616 move_gap_both (from, from_byte);
7617 if (EQ (src_object, dst_object))
7619 struct Lisp_Marker *tail;
7621 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7623 tail->need_adjustment
7624 = tail->charpos == (tail->insertion_type ? from : to);
7625 need_marker_adjustment |= tail->need_adjustment;
7627 saved_pt = PT, saved_pt_byte = PT_BYTE;
7628 TEMP_SET_PT_BOTH (from, from_byte);
7629 current_buffer->text->inhibit_shrinking = 1;
7630 del_range_both (from, from_byte, to, to_byte, 1);
7631 coding->src_pos = -chars;
7632 coding->src_pos_byte = -bytes;
7634 else
7636 coding->src_pos = from;
7637 coding->src_pos_byte = from_byte;
7641 if (CODING_REQUIRE_DETECTION (coding))
7642 detect_coding (coding);
7643 attrs = CODING_ID_ATTRS (coding->id);
7645 if (EQ (dst_object, Qt)
7646 || (! NILP (CODING_ATTR_POST_READ (attrs))
7647 && NILP (dst_object)))
7649 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7650 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7651 coding->dst_pos = BEG;
7652 coding->dst_pos_byte = BEG_BYTE;
7654 else if (BUFFERP (dst_object))
7656 code_conversion_save (0, 0);
7657 coding->dst_object = dst_object;
7658 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7659 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7660 coding->dst_multibyte
7661 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7663 else
7665 code_conversion_save (0, 0);
7666 coding->dst_object = Qnil;
7667 /* Most callers presume this will return a multibyte result, and they
7668 won't use `binary' or `raw-text' anyway, so let's not worry about
7669 CODING_FOR_UNIBYTE. */
7670 coding->dst_multibyte = 1;
7673 decode_coding (coding);
7675 if (BUFFERP (coding->dst_object))
7676 set_buffer_internal (XBUFFER (coding->dst_object));
7678 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7680 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7681 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7682 Lisp_Object val;
7684 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7685 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7686 old_deactivate_mark);
7687 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7688 make_number (coding->produced_char));
7689 UNGCPRO;
7690 CHECK_NATNUM (val);
7691 coding->produced_char += Z - prev_Z;
7692 coding->produced += Z_BYTE - prev_Z_BYTE;
7695 if (EQ (dst_object, Qt))
7697 coding->dst_object = Fbuffer_string ();
7699 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7701 set_buffer_internal (XBUFFER (coding->dst_object));
7702 if (dst_bytes < coding->produced)
7704 destination = xrealloc (destination, coding->produced);
7705 if (! destination)
7707 record_conversion_result (coding,
7708 CODING_RESULT_INSUFFICIENT_MEM);
7709 unbind_to (count, Qnil);
7710 return;
7712 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7713 move_gap_both (BEGV, BEGV_BYTE);
7714 memcpy (destination, BEGV_ADDR, coding->produced);
7715 coding->destination = destination;
7719 if (saved_pt >= 0)
7721 /* This is the case of:
7722 (BUFFERP (src_object) && EQ (src_object, dst_object))
7723 As we have moved PT while replacing the original buffer
7724 contents, we must recover it now. */
7725 set_buffer_internal (XBUFFER (src_object));
7726 current_buffer->text->inhibit_shrinking = 0;
7727 if (saved_pt < from)
7728 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7729 else if (saved_pt < from + chars)
7730 TEMP_SET_PT_BOTH (from, from_byte);
7731 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7732 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7733 saved_pt_byte + (coding->produced - bytes));
7734 else
7735 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7736 saved_pt_byte + (coding->produced - bytes));
7738 if (need_marker_adjustment)
7740 struct Lisp_Marker *tail;
7742 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7743 if (tail->need_adjustment)
7745 tail->need_adjustment = 0;
7746 if (tail->insertion_type)
7748 tail->bytepos = from_byte;
7749 tail->charpos = from;
7751 else
7753 tail->bytepos = from_byte + coding->produced;
7754 tail->charpos
7755 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7756 ? tail->bytepos : from + coding->produced_char);
7762 Vdeactivate_mark = old_deactivate_mark;
7763 unbind_to (count, coding->dst_object);
7767 void
7768 encode_coding_object (struct coding_system *coding,
7769 Lisp_Object src_object,
7770 ptrdiff_t from, ptrdiff_t from_byte,
7771 ptrdiff_t to, ptrdiff_t to_byte,
7772 Lisp_Object dst_object)
7774 ptrdiff_t count = SPECPDL_INDEX ();
7775 ptrdiff_t chars = to - from;
7776 ptrdiff_t bytes = to_byte - from_byte;
7777 Lisp_Object attrs;
7778 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7779 bool need_marker_adjustment = 0;
7780 bool kill_src_buffer = 0;
7781 Lisp_Object old_deactivate_mark;
7783 old_deactivate_mark = Vdeactivate_mark;
7785 coding->src_object = src_object;
7786 coding->src_chars = chars;
7787 coding->src_bytes = bytes;
7788 coding->src_multibyte = chars < bytes;
7790 attrs = CODING_ID_ATTRS (coding->id);
7792 if (EQ (src_object, dst_object))
7794 struct Lisp_Marker *tail;
7796 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7798 tail->need_adjustment
7799 = tail->charpos == (tail->insertion_type ? from : to);
7800 need_marker_adjustment |= tail->need_adjustment;
7804 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7806 coding->src_object = code_conversion_save (1, coding->src_multibyte);
7807 set_buffer_internal (XBUFFER (coding->src_object));
7808 if (STRINGP (src_object))
7809 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7810 else if (BUFFERP (src_object))
7811 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7812 else
7813 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7815 if (EQ (src_object, dst_object))
7817 set_buffer_internal (XBUFFER (src_object));
7818 saved_pt = PT, saved_pt_byte = PT_BYTE;
7819 del_range_both (from, from_byte, to, to_byte, 1);
7820 set_buffer_internal (XBUFFER (coding->src_object));
7824 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7826 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7827 old_deactivate_mark);
7828 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7829 make_number (BEG), make_number (Z));
7830 UNGCPRO;
7832 if (XBUFFER (coding->src_object) != current_buffer)
7833 kill_src_buffer = 1;
7834 coding->src_object = Fcurrent_buffer ();
7835 if (BEG != GPT)
7836 move_gap_both (BEG, BEG_BYTE);
7837 coding->src_chars = Z - BEG;
7838 coding->src_bytes = Z_BYTE - BEG_BYTE;
7839 coding->src_pos = BEG;
7840 coding->src_pos_byte = BEG_BYTE;
7841 coding->src_multibyte = Z < Z_BYTE;
7843 else if (STRINGP (src_object))
7845 code_conversion_save (0, 0);
7846 coding->src_pos = from;
7847 coding->src_pos_byte = from_byte;
7849 else if (BUFFERP (src_object))
7851 code_conversion_save (0, 0);
7852 set_buffer_internal (XBUFFER (src_object));
7853 if (EQ (src_object, dst_object))
7855 saved_pt = PT, saved_pt_byte = PT_BYTE;
7856 coding->src_object = del_range_1 (from, to, 1, 1);
7857 coding->src_pos = 0;
7858 coding->src_pos_byte = 0;
7860 else
7862 if (from < GPT && to >= GPT)
7863 move_gap_both (from, from_byte);
7864 coding->src_pos = from;
7865 coding->src_pos_byte = from_byte;
7868 else
7869 code_conversion_save (0, 0);
7871 if (BUFFERP (dst_object))
7873 coding->dst_object = dst_object;
7874 if (EQ (src_object, dst_object))
7876 coding->dst_pos = from;
7877 coding->dst_pos_byte = from_byte;
7879 else
7881 struct buffer *current = current_buffer;
7883 set_buffer_temp (XBUFFER (dst_object));
7884 coding->dst_pos = PT;
7885 coding->dst_pos_byte = PT_BYTE;
7886 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7887 set_buffer_temp (current);
7889 coding->dst_multibyte
7890 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7892 else if (EQ (dst_object, Qt))
7894 ptrdiff_t dst_bytes = max (1, coding->src_chars);
7895 coding->dst_object = Qnil;
7896 coding->destination = xmalloc (dst_bytes);
7897 coding->dst_bytes = dst_bytes;
7898 coding->dst_multibyte = 0;
7900 else
7902 coding->dst_object = Qnil;
7903 coding->dst_multibyte = 0;
7906 encode_coding (coding);
7908 if (EQ (dst_object, Qt))
7910 if (BUFFERP (coding->dst_object))
7911 coding->dst_object = Fbuffer_string ();
7912 else
7914 coding->dst_object
7915 = make_unibyte_string ((char *) coding->destination,
7916 coding->produced);
7917 xfree (coding->destination);
7921 if (saved_pt >= 0)
7923 /* This is the case of:
7924 (BUFFERP (src_object) && EQ (src_object, dst_object))
7925 As we have moved PT while replacing the original buffer
7926 contents, we must recover it now. */
7927 set_buffer_internal (XBUFFER (src_object));
7928 if (saved_pt < from)
7929 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7930 else if (saved_pt < from + chars)
7931 TEMP_SET_PT_BOTH (from, from_byte);
7932 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7933 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7934 saved_pt_byte + (coding->produced - bytes));
7935 else
7936 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7937 saved_pt_byte + (coding->produced - bytes));
7939 if (need_marker_adjustment)
7941 struct Lisp_Marker *tail;
7943 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7944 if (tail->need_adjustment)
7946 tail->need_adjustment = 0;
7947 if (tail->insertion_type)
7949 tail->bytepos = from_byte;
7950 tail->charpos = from;
7952 else
7954 tail->bytepos = from_byte + coding->produced;
7955 tail->charpos
7956 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7957 ? tail->bytepos : from + coding->produced_char);
7963 if (kill_src_buffer)
7964 Fkill_buffer (coding->src_object);
7966 Vdeactivate_mark = old_deactivate_mark;
7967 unbind_to (count, Qnil);
7971 Lisp_Object
7972 preferred_coding_system (void)
7974 int id = coding_categories[coding_priorities[0]].id;
7976 return CODING_ID_NAME (id);
7979 #if defined (WINDOWSNT) || defined (CYGWIN)
7981 Lisp_Object
7982 from_unicode (Lisp_Object str)
7984 CHECK_STRING (str);
7985 if (!STRING_MULTIBYTE (str) &&
7986 SBYTES (str) & 1)
7988 str = Fsubstring (str, make_number (0), make_number (-1));
7991 return code_convert_string_norecord (str, Qutf_16le, 0);
7994 wchar_t *
7995 to_unicode (Lisp_Object str, Lisp_Object *buf)
7997 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
7998 /* We need to make a another copy (in addition to the one made by
7999 code_convert_string_norecord) to ensure that the final string is
8000 _doubly_ zero terminated --- that is, that the string is
8001 terminated by two zero bytes and one utf-16le null character.
8002 Because strings are already terminated with a single zero byte,
8003 we just add one additional zero. */
8004 str = make_uninit_string (SBYTES (*buf) + 1);
8005 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8006 SDATA (str) [SBYTES (*buf)] = '\0';
8007 *buf = str;
8008 return WCSDATA (*buf);
8011 #endif /* WINDOWSNT || CYGWIN */
8014 #ifdef emacs
8015 /*** 8. Emacs Lisp library functions ***/
8017 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8018 doc: /* Return t if OBJECT is nil or a coding-system.
8019 See the documentation of `define-coding-system' for information
8020 about coding-system objects. */)
8021 (Lisp_Object object)
8023 if (NILP (object)
8024 || CODING_SYSTEM_ID (object) >= 0)
8025 return Qt;
8026 if (! SYMBOLP (object)
8027 || NILP (Fget (object, Qcoding_system_define_form)))
8028 return Qnil;
8029 return Qt;
8032 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8033 Sread_non_nil_coding_system, 1, 1, 0,
8034 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8035 (Lisp_Object prompt)
8037 Lisp_Object val;
8040 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8041 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8043 while (SCHARS (val) == 0);
8044 return (Fintern (val, Qnil));
8047 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8048 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8049 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8050 Ignores case when completing coding systems (all Emacs coding systems
8051 are lower-case). */)
8052 (Lisp_Object prompt, Lisp_Object default_coding_system)
8054 Lisp_Object val;
8055 ptrdiff_t count = SPECPDL_INDEX ();
8057 if (SYMBOLP (default_coding_system))
8058 default_coding_system = SYMBOL_NAME (default_coding_system);
8059 specbind (Qcompletion_ignore_case, Qt);
8060 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8061 Qt, Qnil, Qcoding_system_history,
8062 default_coding_system, Qnil);
8063 unbind_to (count, Qnil);
8064 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8067 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8068 1, 1, 0,
8069 doc: /* Check validity of CODING-SYSTEM.
8070 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8071 It is valid if it is nil or a symbol defined as a coding system by the
8072 function `define-coding-system'. */)
8073 (Lisp_Object coding_system)
8075 Lisp_Object define_form;
8077 define_form = Fget (coding_system, Qcoding_system_define_form);
8078 if (! NILP (define_form))
8080 Fput (coding_system, Qcoding_system_define_form, Qnil);
8081 safe_eval (define_form);
8083 if (!NILP (Fcoding_system_p (coding_system)))
8084 return coding_system;
8085 xsignal1 (Qcoding_system_error, coding_system);
8089 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8090 HIGHEST, return the coding system of the highest
8091 priority among the detected coding systems. Otherwise return a
8092 list of detected coding systems sorted by their priorities. If
8093 MULTIBYTEP, it is assumed that the bytes are in correct
8094 multibyte form but contains only ASCII and eight-bit chars.
8095 Otherwise, the bytes are raw bytes.
8097 CODING-SYSTEM controls the detection as below:
8099 If it is nil, detect both text-format and eol-format. If the
8100 text-format part of CODING-SYSTEM is already specified
8101 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8102 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8103 detect only text-format. */
8105 Lisp_Object
8106 detect_coding_system (const unsigned char *src,
8107 ptrdiff_t src_chars, ptrdiff_t src_bytes,
8108 bool highest, bool multibytep,
8109 Lisp_Object coding_system)
8111 const unsigned char *src_end = src + src_bytes;
8112 Lisp_Object attrs, eol_type;
8113 Lisp_Object val = Qnil;
8114 struct coding_system coding;
8115 ptrdiff_t id;
8116 struct coding_detection_info detect_info;
8117 enum coding_category base_category;
8118 bool null_byte_found = 0, eight_bit_found = 0;
8120 if (NILP (coding_system))
8121 coding_system = Qundecided;
8122 setup_coding_system (coding_system, &coding);
8123 attrs = CODING_ID_ATTRS (coding.id);
8124 eol_type = CODING_ID_EOL_TYPE (coding.id);
8125 coding_system = CODING_ATTR_BASE_NAME (attrs);
8127 coding.source = src;
8128 coding.src_chars = src_chars;
8129 coding.src_bytes = src_bytes;
8130 coding.src_multibyte = multibytep;
8131 coding.consumed = 0;
8132 coding.mode |= CODING_MODE_LAST_BLOCK;
8133 coding.head_ascii = 0;
8135 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8137 /* At first, detect text-format if necessary. */
8138 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8139 if (base_category == coding_category_undecided)
8141 enum coding_category category IF_LINT (= 0);
8142 struct coding_system *this IF_LINT (= NULL);
8143 int c, i;
8145 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8146 for (; src < src_end; src++)
8148 c = *src;
8149 if (c & 0x80)
8151 eight_bit_found = 1;
8152 if (null_byte_found)
8153 break;
8155 else if (c < 0x20)
8157 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8158 && ! inhibit_iso_escape_detection
8159 && ! detect_info.checked)
8161 if (detect_coding_iso_2022 (&coding, &detect_info))
8163 /* We have scanned the whole data. */
8164 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8166 /* We didn't find an 8-bit code. We may
8167 have found a null-byte, but it's very
8168 rare that a binary file confirm to
8169 ISO-2022. */
8170 src = src_end;
8171 coding.head_ascii = src - coding.source;
8173 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8174 break;
8177 else if (! c && !inhibit_null_byte_detection)
8179 null_byte_found = 1;
8180 if (eight_bit_found)
8181 break;
8183 if (! eight_bit_found)
8184 coding.head_ascii++;
8186 else if (! eight_bit_found)
8187 coding.head_ascii++;
8190 if (null_byte_found || eight_bit_found
8191 || coding.head_ascii < coding.src_bytes
8192 || detect_info.found)
8194 if (coding.head_ascii == coding.src_bytes)
8195 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8196 for (i = 0; i < coding_category_raw_text; i++)
8198 category = coding_priorities[i];
8199 this = coding_categories + category;
8200 if (detect_info.found & (1 << category))
8201 break;
8203 else
8205 if (null_byte_found)
8207 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8208 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8210 for (i = 0; i < coding_category_raw_text; i++)
8212 category = coding_priorities[i];
8213 this = coding_categories + category;
8215 if (this->id < 0)
8217 /* No coding system of this category is defined. */
8218 detect_info.rejected |= (1 << category);
8220 else if (category >= coding_category_raw_text)
8221 continue;
8222 else if (detect_info.checked & (1 << category))
8224 if (highest
8225 && (detect_info.found & (1 << category)))
8226 break;
8228 else if ((*(this->detector)) (&coding, &detect_info)
8229 && highest
8230 && (detect_info.found & (1 << category)))
8232 if (category == coding_category_utf_16_auto)
8234 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8235 category = coding_category_utf_16_le;
8236 else
8237 category = coding_category_utf_16_be;
8239 break;
8245 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8246 || null_byte_found)
8248 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8249 id = CODING_SYSTEM_ID (Qno_conversion);
8250 val = Fcons (make_number (id), Qnil);
8252 else if (! detect_info.rejected && ! detect_info.found)
8254 detect_info.found = CATEGORY_MASK_ANY;
8255 id = coding_categories[coding_category_undecided].id;
8256 val = Fcons (make_number (id), Qnil);
8258 else if (highest)
8260 if (detect_info.found)
8262 detect_info.found = 1 << category;
8263 val = Fcons (make_number (this->id), Qnil);
8265 else
8266 for (i = 0; i < coding_category_raw_text; i++)
8267 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8269 detect_info.found = 1 << coding_priorities[i];
8270 id = coding_categories[coding_priorities[i]].id;
8271 val = Fcons (make_number (id), Qnil);
8272 break;
8275 else
8277 int mask = detect_info.rejected | detect_info.found;
8278 int found = 0;
8280 for (i = coding_category_raw_text - 1; i >= 0; i--)
8282 category = coding_priorities[i];
8283 if (! (mask & (1 << category)))
8285 found |= 1 << category;
8286 id = coding_categories[category].id;
8287 if (id >= 0)
8288 val = Fcons (make_number (id), val);
8291 for (i = coding_category_raw_text - 1; i >= 0; i--)
8293 category = coding_priorities[i];
8294 if (detect_info.found & (1 << category))
8296 id = coding_categories[category].id;
8297 val = Fcons (make_number (id), val);
8300 detect_info.found |= found;
8303 else if (base_category == coding_category_utf_8_auto)
8305 if (detect_coding_utf_8 (&coding, &detect_info))
8307 struct coding_system *this;
8309 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8310 this = coding_categories + coding_category_utf_8_sig;
8311 else
8312 this = coding_categories + coding_category_utf_8_nosig;
8313 val = Fcons (make_number (this->id), Qnil);
8316 else if (base_category == coding_category_utf_16_auto)
8318 if (detect_coding_utf_16 (&coding, &detect_info))
8320 struct coding_system *this;
8322 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8323 this = coding_categories + coding_category_utf_16_le;
8324 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8325 this = coding_categories + coding_category_utf_16_be;
8326 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8327 this = coding_categories + coding_category_utf_16_be_nosig;
8328 else
8329 this = coding_categories + coding_category_utf_16_le_nosig;
8330 val = Fcons (make_number (this->id), Qnil);
8333 else
8335 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8336 val = Fcons (make_number (coding.id), Qnil);
8339 /* Then, detect eol-format if necessary. */
8341 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8342 Lisp_Object tail;
8344 if (VECTORP (eol_type))
8346 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8348 if (null_byte_found)
8349 normal_eol = EOL_SEEN_LF;
8350 else
8351 normal_eol = detect_eol (coding.source, src_bytes,
8352 coding_category_raw_text);
8354 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8355 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8356 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8357 coding_category_utf_16_be);
8358 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8359 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8360 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8361 coding_category_utf_16_le);
8363 else
8365 if (EQ (eol_type, Qunix))
8366 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8367 else if (EQ (eol_type, Qdos))
8368 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8369 else
8370 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8373 for (tail = val; CONSP (tail); tail = XCDR (tail))
8375 enum coding_category category;
8376 int this_eol;
8378 id = XINT (XCAR (tail));
8379 attrs = CODING_ID_ATTRS (id);
8380 category = XINT (CODING_ATTR_CATEGORY (attrs));
8381 eol_type = CODING_ID_EOL_TYPE (id);
8382 if (VECTORP (eol_type))
8384 if (category == coding_category_utf_16_be
8385 || category == coding_category_utf_16_be_nosig)
8386 this_eol = utf_16_be_eol;
8387 else if (category == coding_category_utf_16_le
8388 || category == coding_category_utf_16_le_nosig)
8389 this_eol = utf_16_le_eol;
8390 else
8391 this_eol = normal_eol;
8393 if (this_eol == EOL_SEEN_LF)
8394 XSETCAR (tail, AREF (eol_type, 0));
8395 else if (this_eol == EOL_SEEN_CRLF)
8396 XSETCAR (tail, AREF (eol_type, 1));
8397 else if (this_eol == EOL_SEEN_CR)
8398 XSETCAR (tail, AREF (eol_type, 2));
8399 else
8400 XSETCAR (tail, CODING_ID_NAME (id));
8402 else
8403 XSETCAR (tail, CODING_ID_NAME (id));
8407 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8411 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8412 2, 3, 0,
8413 doc: /* Detect coding system of the text in the region between START and END.
8414 Return a list of possible coding systems ordered by priority.
8415 The coding systems to try and their priorities follows what
8416 the function `coding-system-priority-list' (which see) returns.
8418 If only ASCII characters are found (except for such ISO-2022 control
8419 characters as ESC), it returns a list of single element `undecided'
8420 or its subsidiary coding system according to a detected end-of-line
8421 format.
8423 If optional argument HIGHEST is non-nil, return the coding system of
8424 highest priority. */)
8425 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8427 ptrdiff_t from, to;
8428 ptrdiff_t from_byte, to_byte;
8430 CHECK_NUMBER_COERCE_MARKER (start);
8431 CHECK_NUMBER_COERCE_MARKER (end);
8433 validate_region (&start, &end);
8434 from = XINT (start), to = XINT (end);
8435 from_byte = CHAR_TO_BYTE (from);
8436 to_byte = CHAR_TO_BYTE (to);
8438 if (from < GPT && to >= GPT)
8439 move_gap_both (to, to_byte);
8441 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8442 to - from, to_byte - from_byte,
8443 !NILP (highest),
8444 !NILP (BVAR (current_buffer
8445 , enable_multibyte_characters)),
8446 Qnil);
8449 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8450 1, 2, 0,
8451 doc: /* Detect coding system of the text in STRING.
8452 Return a list of possible coding systems ordered by priority.
8453 The coding systems to try and their priorities follows what
8454 the function `coding-system-priority-list' (which see) returns.
8456 If only ASCII characters are found (except for such ISO-2022 control
8457 characters as ESC), it returns a list of single element `undecided'
8458 or its subsidiary coding system according to a detected end-of-line
8459 format.
8461 If optional argument HIGHEST is non-nil, return the coding system of
8462 highest priority. */)
8463 (Lisp_Object string, Lisp_Object highest)
8465 CHECK_STRING (string);
8467 return detect_coding_system (SDATA (string),
8468 SCHARS (string), SBYTES (string),
8469 !NILP (highest), STRING_MULTIBYTE (string),
8470 Qnil);
8474 static bool
8475 char_encodable_p (int c, Lisp_Object attrs)
8477 Lisp_Object tail;
8478 struct charset *charset;
8479 Lisp_Object translation_table;
8481 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8482 if (! NILP (translation_table))
8483 c = translate_char (translation_table, c);
8484 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8485 CONSP (tail); tail = XCDR (tail))
8487 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8488 if (CHAR_CHARSET_P (c, charset))
8489 break;
8491 return (! NILP (tail));
8495 /* Return a list of coding systems that safely encode the text between
8496 START and END. If EXCLUDE is non-nil, it is a list of coding
8497 systems not to check. The returned list doesn't contain any such
8498 coding systems. In any case, if the text contains only ASCII or is
8499 unibyte, return t. */
8501 DEFUN ("find-coding-systems-region-internal",
8502 Ffind_coding_systems_region_internal,
8503 Sfind_coding_systems_region_internal, 2, 3, 0,
8504 doc: /* Internal use only. */)
8505 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8507 Lisp_Object coding_attrs_list, safe_codings;
8508 ptrdiff_t start_byte, end_byte;
8509 const unsigned char *p, *pbeg, *pend;
8510 int c;
8511 Lisp_Object tail, elt, work_table;
8513 if (STRINGP (start))
8515 if (!STRING_MULTIBYTE (start)
8516 || SCHARS (start) == SBYTES (start))
8517 return Qt;
8518 start_byte = 0;
8519 end_byte = SBYTES (start);
8521 else
8523 CHECK_NUMBER_COERCE_MARKER (start);
8524 CHECK_NUMBER_COERCE_MARKER (end);
8525 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8526 args_out_of_range (start, end);
8527 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8528 return Qt;
8529 start_byte = CHAR_TO_BYTE (XINT (start));
8530 end_byte = CHAR_TO_BYTE (XINT (end));
8531 if (XINT (end) - XINT (start) == end_byte - start_byte)
8532 return Qt;
8534 if (XINT (start) < GPT && XINT (end) > GPT)
8536 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8537 move_gap_both (XINT (start), start_byte);
8538 else
8539 move_gap_both (XINT (end), end_byte);
8543 coding_attrs_list = Qnil;
8544 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8545 if (NILP (exclude)
8546 || NILP (Fmemq (XCAR (tail), exclude)))
8548 Lisp_Object attrs;
8550 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8551 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8552 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8554 ASET (attrs, coding_attr_trans_tbl,
8555 get_translation_table (attrs, 1, NULL));
8556 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8560 if (STRINGP (start))
8561 p = pbeg = SDATA (start);
8562 else
8563 p = pbeg = BYTE_POS_ADDR (start_byte);
8564 pend = p + (end_byte - start_byte);
8566 while (p < pend && ASCII_BYTE_P (*p)) p++;
8567 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8569 work_table = Fmake_char_table (Qnil, Qnil);
8570 while (p < pend)
8572 if (ASCII_BYTE_P (*p))
8573 p++;
8574 else
8576 c = STRING_CHAR_ADVANCE (p);
8577 if (!NILP (char_table_ref (work_table, c)))
8578 /* This character was already checked. Ignore it. */
8579 continue;
8581 charset_map_loaded = 0;
8582 for (tail = coding_attrs_list; CONSP (tail);)
8584 elt = XCAR (tail);
8585 if (NILP (elt))
8586 tail = XCDR (tail);
8587 else if (char_encodable_p (c, elt))
8588 tail = XCDR (tail);
8589 else if (CONSP (XCDR (tail)))
8591 XSETCAR (tail, XCAR (XCDR (tail)));
8592 XSETCDR (tail, XCDR (XCDR (tail)));
8594 else
8596 XSETCAR (tail, Qnil);
8597 tail = XCDR (tail);
8600 if (charset_map_loaded)
8602 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8604 if (STRINGP (start))
8605 pbeg = SDATA (start);
8606 else
8607 pbeg = BYTE_POS_ADDR (start_byte);
8608 p = pbeg + p_offset;
8609 pend = pbeg + pend_offset;
8611 char_table_set (work_table, c, Qt);
8615 safe_codings = list2 (Qraw_text, Qno_conversion);
8616 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8617 if (! NILP (XCAR (tail)))
8618 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8620 return safe_codings;
8624 DEFUN ("unencodable-char-position", Funencodable_char_position,
8625 Sunencodable_char_position, 3, 5, 0,
8626 doc: /*
8627 Return position of first un-encodable character in a region.
8628 START and END specify the region and CODING-SYSTEM specifies the
8629 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8631 If optional 4th argument COUNT is non-nil, it specifies at most how
8632 many un-encodable characters to search. In this case, the value is a
8633 list of positions.
8635 If optional 5th argument STRING is non-nil, it is a string to search
8636 for un-encodable characters. In that case, START and END are indexes
8637 to the string. */)
8638 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8640 EMACS_INT n;
8641 struct coding_system coding;
8642 Lisp_Object attrs, charset_list, translation_table;
8643 Lisp_Object positions;
8644 ptrdiff_t from, to;
8645 const unsigned char *p, *stop, *pend;
8646 bool ascii_compatible;
8648 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8649 attrs = CODING_ID_ATTRS (coding.id);
8650 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8651 return Qnil;
8652 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8653 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8654 translation_table = get_translation_table (attrs, 1, NULL);
8656 if (NILP (string))
8658 validate_region (&start, &end);
8659 from = XINT (start);
8660 to = XINT (end);
8661 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8662 || (ascii_compatible
8663 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8664 return Qnil;
8665 p = CHAR_POS_ADDR (from);
8666 pend = CHAR_POS_ADDR (to);
8667 if (from < GPT && to >= GPT)
8668 stop = GPT_ADDR;
8669 else
8670 stop = pend;
8672 else
8674 CHECK_STRING (string);
8675 CHECK_NATNUM (start);
8676 CHECK_NATNUM (end);
8677 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8678 args_out_of_range_3 (string, start, end);
8679 from = XINT (start);
8680 to = XINT (end);
8681 if (! STRING_MULTIBYTE (string))
8682 return Qnil;
8683 p = SDATA (string) + string_char_to_byte (string, from);
8684 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8685 if (ascii_compatible && (to - from) == (pend - p))
8686 return Qnil;
8689 if (NILP (count))
8690 n = 1;
8691 else
8693 CHECK_NATNUM (count);
8694 n = XINT (count);
8697 positions = Qnil;
8698 charset_map_loaded = 0;
8699 while (1)
8701 int c;
8703 if (ascii_compatible)
8704 while (p < stop && ASCII_BYTE_P (*p))
8705 p++, from++;
8706 if (p >= stop)
8708 if (p >= pend)
8709 break;
8710 stop = pend;
8711 p = GAP_END_ADDR;
8714 c = STRING_CHAR_ADVANCE (p);
8715 if (! (ASCII_CHAR_P (c) && ascii_compatible)
8716 && ! char_charset (translate_char (translation_table, c),
8717 charset_list, NULL))
8719 positions = Fcons (make_number (from), positions);
8720 n--;
8721 if (n == 0)
8722 break;
8725 from++;
8726 if (charset_map_loaded && NILP (string))
8728 p = CHAR_POS_ADDR (from);
8729 pend = CHAR_POS_ADDR (to);
8730 if (from < GPT && to >= GPT)
8731 stop = GPT_ADDR;
8732 else
8733 stop = pend;
8734 charset_map_loaded = 0;
8738 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8742 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8743 Scheck_coding_systems_region, 3, 3, 0,
8744 doc: /* Check if the region is encodable by coding systems.
8746 START and END are buffer positions specifying the region.
8747 CODING-SYSTEM-LIST is a list of coding systems to check.
8749 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8750 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8751 whole region, POS0, POS1, ... are buffer positions where non-encodable
8752 characters are found.
8754 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8755 value is nil.
8757 START may be a string. In that case, check if the string is
8758 encodable, and the value contains indices to the string instead of
8759 buffer positions. END is ignored.
8761 If the current buffer (or START if it is a string) is unibyte, the value
8762 is nil. */)
8763 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8765 Lisp_Object list;
8766 ptrdiff_t start_byte, end_byte;
8767 ptrdiff_t pos;
8768 const unsigned char *p, *pbeg, *pend;
8769 int c;
8770 Lisp_Object tail, elt, attrs;
8772 if (STRINGP (start))
8774 if (!STRING_MULTIBYTE (start)
8775 || SCHARS (start) == SBYTES (start))
8776 return Qnil;
8777 start_byte = 0;
8778 end_byte = SBYTES (start);
8779 pos = 0;
8781 else
8783 CHECK_NUMBER_COERCE_MARKER (start);
8784 CHECK_NUMBER_COERCE_MARKER (end);
8785 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8786 args_out_of_range (start, end);
8787 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8788 return Qnil;
8789 start_byte = CHAR_TO_BYTE (XINT (start));
8790 end_byte = CHAR_TO_BYTE (XINT (end));
8791 if (XINT (end) - XINT (start) == end_byte - start_byte)
8792 return Qnil;
8794 if (XINT (start) < GPT && XINT (end) > GPT)
8796 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8797 move_gap_both (XINT (start), start_byte);
8798 else
8799 move_gap_both (XINT (end), end_byte);
8801 pos = XINT (start);
8804 list = Qnil;
8805 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8807 elt = XCAR (tail);
8808 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8809 ASET (attrs, coding_attr_trans_tbl,
8810 get_translation_table (attrs, 1, NULL));
8811 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8814 if (STRINGP (start))
8815 p = pbeg = SDATA (start);
8816 else
8817 p = pbeg = BYTE_POS_ADDR (start_byte);
8818 pend = p + (end_byte - start_byte);
8820 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8821 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8823 while (p < pend)
8825 if (ASCII_BYTE_P (*p))
8826 p++;
8827 else
8829 c = STRING_CHAR_ADVANCE (p);
8831 charset_map_loaded = 0;
8832 for (tail = list; CONSP (tail); tail = XCDR (tail))
8834 elt = XCDR (XCAR (tail));
8835 if (! char_encodable_p (c, XCAR (elt)))
8836 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8838 if (charset_map_loaded)
8840 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8842 if (STRINGP (start))
8843 pbeg = SDATA (start);
8844 else
8845 pbeg = BYTE_POS_ADDR (start_byte);
8846 p = pbeg + p_offset;
8847 pend = pbeg + pend_offset;
8850 pos++;
8853 tail = list;
8854 list = Qnil;
8855 for (; CONSP (tail); tail = XCDR (tail))
8857 elt = XCAR (tail);
8858 if (CONSP (XCDR (XCDR (elt))))
8859 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8860 list);
8863 return list;
8867 static Lisp_Object
8868 code_convert_region (Lisp_Object start, Lisp_Object end,
8869 Lisp_Object coding_system, Lisp_Object dst_object,
8870 bool encodep, bool norecord)
8872 struct coding_system coding;
8873 ptrdiff_t from, from_byte, to, to_byte;
8874 Lisp_Object src_object;
8876 CHECK_NUMBER_COERCE_MARKER (start);
8877 CHECK_NUMBER_COERCE_MARKER (end);
8878 if (NILP (coding_system))
8879 coding_system = Qno_conversion;
8880 else
8881 CHECK_CODING_SYSTEM (coding_system);
8882 src_object = Fcurrent_buffer ();
8883 if (NILP (dst_object))
8884 dst_object = src_object;
8885 else if (! EQ (dst_object, Qt))
8886 CHECK_BUFFER (dst_object);
8888 validate_region (&start, &end);
8889 from = XFASTINT (start);
8890 from_byte = CHAR_TO_BYTE (from);
8891 to = XFASTINT (end);
8892 to_byte = CHAR_TO_BYTE (to);
8894 setup_coding_system (coding_system, &coding);
8895 coding.mode |= CODING_MODE_LAST_BLOCK;
8897 if (encodep)
8898 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8899 dst_object);
8900 else
8901 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8902 dst_object);
8903 if (! norecord)
8904 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8906 return (BUFFERP (dst_object)
8907 ? make_number (coding.produced_char)
8908 : coding.dst_object);
8912 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8913 3, 4, "r\nzCoding system: ",
8914 doc: /* Decode the current region from the specified coding system.
8915 When called from a program, takes four arguments:
8916 START, END, CODING-SYSTEM, and DESTINATION.
8917 START and END are buffer positions.
8919 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8920 If nil, the region between START and END is replaced by the decoded text.
8921 If buffer, the decoded text is inserted in that buffer after point (point
8922 does not move).
8923 In those cases, the length of the decoded text is returned.
8924 If DESTINATION is t, the decoded text is returned.
8926 This function sets `last-coding-system-used' to the precise coding system
8927 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8928 not fully specified.) */)
8929 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8931 return code_convert_region (start, end, coding_system, destination, 0, 0);
8934 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8935 3, 4, "r\nzCoding system: ",
8936 doc: /* Encode the current region by specified coding system.
8937 When called from a program, takes four arguments:
8938 START, END, CODING-SYSTEM and DESTINATION.
8939 START and END are buffer positions.
8941 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8942 If nil, the region between START and END is replace by the encoded text.
8943 If buffer, the encoded text is inserted in that buffer after point (point
8944 does not move).
8945 In those cases, the length of the encoded text is returned.
8946 If DESTINATION is t, the encoded text is returned.
8948 This function sets `last-coding-system-used' to the precise coding system
8949 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8950 not fully specified.) */)
8951 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8953 return code_convert_region (start, end, coding_system, destination, 1, 0);
8956 Lisp_Object
8957 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8958 Lisp_Object dst_object, bool encodep, bool nocopy,
8959 bool norecord)
8961 struct coding_system coding;
8962 ptrdiff_t chars, bytes;
8964 CHECK_STRING (string);
8965 if (NILP (coding_system))
8967 if (! norecord)
8968 Vlast_coding_system_used = Qno_conversion;
8969 if (NILP (dst_object))
8970 return (nocopy ? Fcopy_sequence (string) : string);
8973 if (NILP (coding_system))
8974 coding_system = Qno_conversion;
8975 else
8976 CHECK_CODING_SYSTEM (coding_system);
8977 if (NILP (dst_object))
8978 dst_object = Qt;
8979 else if (! EQ (dst_object, Qt))
8980 CHECK_BUFFER (dst_object);
8982 setup_coding_system (coding_system, &coding);
8983 coding.mode |= CODING_MODE_LAST_BLOCK;
8984 chars = SCHARS (string);
8985 bytes = SBYTES (string);
8986 if (encodep)
8987 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8988 else
8989 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8990 if (! norecord)
8991 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8993 return (BUFFERP (dst_object)
8994 ? make_number (coding.produced_char)
8995 : coding.dst_object);
8999 /* Encode or decode STRING according to CODING_SYSTEM.
9000 Do not set Vlast_coding_system_used.
9002 This function is called only from macros DECODE_FILE and
9003 ENCODE_FILE, thus we ignore character composition. */
9005 Lisp_Object
9006 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9007 bool encodep)
9009 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9013 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9014 2, 4, 0,
9015 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9017 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9018 if the decoding operation is trivial.
9020 Optional fourth arg BUFFER non-nil means that the decoded text is
9021 inserted in that buffer after point (point does not move). In this
9022 case, the return value is the length of the decoded text.
9024 This function sets `last-coding-system-used' to the precise coding system
9025 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9026 not fully specified.) */)
9027 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9029 return code_convert_string (string, coding_system, buffer,
9030 0, ! NILP (nocopy), 0);
9033 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9034 2, 4, 0,
9035 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9037 Optional third arg NOCOPY non-nil means it is OK to return STRING
9038 itself if the encoding operation is trivial.
9040 Optional fourth arg BUFFER non-nil means that the encoded text is
9041 inserted in that buffer after point (point does not move). In this
9042 case, the return value is the length of the encoded text.
9044 This function sets `last-coding-system-used' to the precise coding system
9045 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9046 not fully specified.) */)
9047 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9049 return code_convert_string (string, coding_system, buffer,
9050 1, ! NILP (nocopy), 0);
9054 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9055 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9056 Return the corresponding character. */)
9057 (Lisp_Object code)
9059 Lisp_Object spec, attrs, val;
9060 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9061 EMACS_INT ch;
9062 int c;
9064 CHECK_NATNUM (code);
9065 ch = XFASTINT (code);
9066 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9067 attrs = AREF (spec, 0);
9069 if (ASCII_BYTE_P (ch)
9070 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9071 return code;
9073 val = CODING_ATTR_CHARSET_LIST (attrs);
9074 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9075 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9076 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9078 if (ch <= 0x7F)
9080 c = ch;
9081 charset = charset_roman;
9083 else if (ch >= 0xA0 && ch < 0xDF)
9085 c = ch - 0x80;
9086 charset = charset_kana;
9088 else
9090 EMACS_INT c1 = ch >> 8;
9091 int c2 = ch & 0xFF;
9093 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9094 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9095 error ("Invalid code: %"pI"d", ch);
9096 c = ch;
9097 SJIS_TO_JIS (c);
9098 charset = charset_kanji;
9100 c = DECODE_CHAR (charset, c);
9101 if (c < 0)
9102 error ("Invalid code: %"pI"d", ch);
9103 return make_number (c);
9107 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9108 doc: /* Encode a Japanese character CH to shift_jis encoding.
9109 Return the corresponding code in SJIS. */)
9110 (Lisp_Object ch)
9112 Lisp_Object spec, attrs, charset_list;
9113 int c;
9114 struct charset *charset;
9115 unsigned code;
9117 CHECK_CHARACTER (ch);
9118 c = XFASTINT (ch);
9119 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9120 attrs = AREF (spec, 0);
9122 if (ASCII_CHAR_P (c)
9123 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9124 return ch;
9126 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9127 charset = char_charset (c, charset_list, &code);
9128 if (code == CHARSET_INVALID_CODE (charset))
9129 error ("Can't encode by shift_jis encoding: %c", c);
9130 JIS_TO_SJIS (code);
9132 return make_number (code);
9135 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9136 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9137 Return the corresponding character. */)
9138 (Lisp_Object code)
9140 Lisp_Object spec, attrs, val;
9141 struct charset *charset_roman, *charset_big5, *charset;
9142 EMACS_INT ch;
9143 int c;
9145 CHECK_NATNUM (code);
9146 ch = XFASTINT (code);
9147 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9148 attrs = AREF (spec, 0);
9150 if (ASCII_BYTE_P (ch)
9151 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9152 return code;
9154 val = CODING_ATTR_CHARSET_LIST (attrs);
9155 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9156 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9158 if (ch <= 0x7F)
9160 c = ch;
9161 charset = charset_roman;
9163 else
9165 EMACS_INT b1 = ch >> 8;
9166 int b2 = ch & 0x7F;
9167 if (b1 < 0xA1 || b1 > 0xFE
9168 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9169 error ("Invalid code: %"pI"d", ch);
9170 c = ch;
9171 charset = charset_big5;
9173 c = DECODE_CHAR (charset, c);
9174 if (c < 0)
9175 error ("Invalid code: %"pI"d", ch);
9176 return make_number (c);
9179 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9180 doc: /* Encode the Big5 character CH to BIG5 coding system.
9181 Return the corresponding character code in Big5. */)
9182 (Lisp_Object ch)
9184 Lisp_Object spec, attrs, charset_list;
9185 struct charset *charset;
9186 int c;
9187 unsigned code;
9189 CHECK_CHARACTER (ch);
9190 c = XFASTINT (ch);
9191 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9192 attrs = AREF (spec, 0);
9193 if (ASCII_CHAR_P (c)
9194 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9195 return ch;
9197 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9198 charset = char_charset (c, charset_list, &code);
9199 if (code == CHARSET_INVALID_CODE (charset))
9200 error ("Can't encode by Big5 encoding: %c", c);
9202 return make_number (code);
9206 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9207 Sset_terminal_coding_system_internal, 1, 2, 0,
9208 doc: /* Internal use only. */)
9209 (Lisp_Object coding_system, Lisp_Object terminal)
9211 struct terminal *term = get_terminal (terminal, 1);
9212 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9213 CHECK_SYMBOL (coding_system);
9214 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9215 /* We had better not send unsafe characters to terminal. */
9216 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9217 /* Character composition should be disabled. */
9218 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9219 terminal_coding->src_multibyte = 1;
9220 terminal_coding->dst_multibyte = 0;
9221 tset_charset_list
9222 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9223 ? coding_charset_list (terminal_coding)
9224 : Fcons (make_number (charset_ascii), Qnil)));
9225 return Qnil;
9228 DEFUN ("set-safe-terminal-coding-system-internal",
9229 Fset_safe_terminal_coding_system_internal,
9230 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9231 doc: /* Internal use only. */)
9232 (Lisp_Object coding_system)
9234 CHECK_SYMBOL (coding_system);
9235 setup_coding_system (Fcheck_coding_system (coding_system),
9236 &safe_terminal_coding);
9237 /* Character composition should be disabled. */
9238 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9239 safe_terminal_coding.src_multibyte = 1;
9240 safe_terminal_coding.dst_multibyte = 0;
9241 return Qnil;
9244 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9245 Sterminal_coding_system, 0, 1, 0,
9246 doc: /* Return coding system specified for terminal output on the given terminal.
9247 TERMINAL may be a terminal object, a frame, or nil for the selected
9248 frame's terminal device. */)
9249 (Lisp_Object terminal)
9251 struct coding_system *terminal_coding
9252 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9253 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9255 /* For backward compatibility, return nil if it is `undecided'. */
9256 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9259 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9260 Sset_keyboard_coding_system_internal, 1, 2, 0,
9261 doc: /* Internal use only. */)
9262 (Lisp_Object coding_system, Lisp_Object terminal)
9264 struct terminal *t = get_terminal (terminal, 1);
9265 CHECK_SYMBOL (coding_system);
9266 if (NILP (coding_system))
9267 coding_system = Qno_conversion;
9268 else
9269 Fcheck_coding_system (coding_system);
9270 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9271 /* Character composition should be disabled. */
9272 TERMINAL_KEYBOARD_CODING (t)->common_flags
9273 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9274 return Qnil;
9277 DEFUN ("keyboard-coding-system",
9278 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9279 doc: /* Return coding system specified for decoding keyboard input. */)
9280 (Lisp_Object terminal)
9282 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9283 (get_terminal (terminal, 1))->id);
9287 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9288 Sfind_operation_coding_system, 1, MANY, 0,
9289 doc: /* Choose a coding system for an operation based on the target name.
9290 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9291 DECODING-SYSTEM is the coding system to use for decoding
9292 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9293 for encoding (in case OPERATION does encoding).
9295 The first argument OPERATION specifies an I/O primitive:
9296 For file I/O, `insert-file-contents' or `write-region'.
9297 For process I/O, `call-process', `call-process-region', or `start-process'.
9298 For network I/O, `open-network-stream'.
9300 The remaining arguments should be the same arguments that were passed
9301 to the primitive. Depending on which primitive, one of those arguments
9302 is selected as the TARGET. For example, if OPERATION does file I/O,
9303 whichever argument specifies the file name is TARGET.
9305 TARGET has a meaning which depends on OPERATION:
9306 For file I/O, TARGET is a file name (except for the special case below).
9307 For process I/O, TARGET is a process name.
9308 For network I/O, TARGET is a service name or a port number.
9310 This function looks up what is specified for TARGET in
9311 `file-coding-system-alist', `process-coding-system-alist',
9312 or `network-coding-system-alist' depending on OPERATION.
9313 They may specify a coding system, a cons of coding systems,
9314 or a function symbol to call.
9315 In the last case, we call the function with one argument,
9316 which is a list of all the arguments given to this function.
9317 If the function can't decide a coding system, it can return
9318 `undecided' so that the normal code-detection is performed.
9320 If OPERATION is `insert-file-contents', the argument corresponding to
9321 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9322 file name to look up, and BUFFER is a buffer that contains the file's
9323 contents (not yet decoded). If `file-coding-system-alist' specifies a
9324 function to call for FILENAME, that function should examine the
9325 contents of BUFFER instead of reading the file.
9327 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9328 (ptrdiff_t nargs, Lisp_Object *args)
9330 Lisp_Object operation, target_idx, target, val;
9331 register Lisp_Object chain;
9333 if (nargs < 2)
9334 error ("Too few arguments");
9335 operation = args[0];
9336 if (!SYMBOLP (operation)
9337 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9338 error ("Invalid first argument");
9339 if (nargs <= 1 + XFASTINT (target_idx))
9340 error ("Too few arguments for operation `%s'",
9341 SDATA (SYMBOL_NAME (operation)));
9342 target = args[XFASTINT (target_idx) + 1];
9343 if (!(STRINGP (target)
9344 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9345 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9346 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9347 error ("Invalid argument %"pI"d of operation `%s'",
9348 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9349 if (CONSP (target))
9350 target = XCAR (target);
9352 chain = ((EQ (operation, Qinsert_file_contents)
9353 || EQ (operation, Qwrite_region))
9354 ? Vfile_coding_system_alist
9355 : (EQ (operation, Qopen_network_stream)
9356 ? Vnetwork_coding_system_alist
9357 : Vprocess_coding_system_alist));
9358 if (NILP (chain))
9359 return Qnil;
9361 for (; CONSP (chain); chain = XCDR (chain))
9363 Lisp_Object elt;
9365 elt = XCAR (chain);
9366 if (CONSP (elt)
9367 && ((STRINGP (target)
9368 && STRINGP (XCAR (elt))
9369 && fast_string_match (XCAR (elt), target) >= 0)
9370 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9372 val = XCDR (elt);
9373 /* Here, if VAL is both a valid coding system and a valid
9374 function symbol, we return VAL as a coding system. */
9375 if (CONSP (val))
9376 return val;
9377 if (! SYMBOLP (val))
9378 return Qnil;
9379 if (! NILP (Fcoding_system_p (val)))
9380 return Fcons (val, val);
9381 if (! NILP (Ffboundp (val)))
9383 /* We use call1 rather than safe_call1
9384 so as to get bug reports about functions called here
9385 which don't handle the current interface. */
9386 val = call1 (val, Flist (nargs, args));
9387 if (CONSP (val))
9388 return val;
9389 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9390 return Fcons (val, val);
9392 return Qnil;
9395 return Qnil;
9398 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9399 Sset_coding_system_priority, 0, MANY, 0,
9400 doc: /* Assign higher priority to the coding systems given as arguments.
9401 If multiple coding systems belong to the same category,
9402 all but the first one are ignored.
9404 usage: (set-coding-system-priority &rest coding-systems) */)
9405 (ptrdiff_t nargs, Lisp_Object *args)
9407 ptrdiff_t i, j;
9408 bool changed[coding_category_max];
9409 enum coding_category priorities[coding_category_max];
9411 memset (changed, 0, sizeof changed);
9413 for (i = j = 0; i < nargs; i++)
9415 enum coding_category category;
9416 Lisp_Object spec, attrs;
9418 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9419 attrs = AREF (spec, 0);
9420 category = XINT (CODING_ATTR_CATEGORY (attrs));
9421 if (changed[category])
9422 /* Ignore this coding system because a coding system of the
9423 same category already had a higher priority. */
9424 continue;
9425 changed[category] = 1;
9426 priorities[j++] = category;
9427 if (coding_categories[category].id >= 0
9428 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9429 setup_coding_system (args[i], &coding_categories[category]);
9430 Fset (AREF (Vcoding_category_table, category), args[i]);
9433 /* Now we have decided top J priorities. Reflect the order of the
9434 original priorities to the remaining priorities. */
9436 for (i = j, j = 0; i < coding_category_max; i++, j++)
9438 while (j < coding_category_max
9439 && changed[coding_priorities[j]])
9440 j++;
9441 if (j == coding_category_max)
9442 emacs_abort ();
9443 priorities[i] = coding_priorities[j];
9446 memcpy (coding_priorities, priorities, sizeof priorities);
9448 /* Update `coding-category-list'. */
9449 Vcoding_category_list = Qnil;
9450 for (i = coding_category_max; i-- > 0; )
9451 Vcoding_category_list
9452 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9453 Vcoding_category_list);
9455 return Qnil;
9458 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9459 Scoding_system_priority_list, 0, 1, 0,
9460 doc: /* Return a list of coding systems ordered by their priorities.
9461 The list contains a subset of coding systems; i.e. coding systems
9462 assigned to each coding category (see `coding-category-list').
9464 HIGHESTP non-nil means just return the highest priority one. */)
9465 (Lisp_Object highestp)
9467 int i;
9468 Lisp_Object val;
9470 for (i = 0, val = Qnil; i < coding_category_max; i++)
9472 enum coding_category category = coding_priorities[i];
9473 int id = coding_categories[category].id;
9474 Lisp_Object attrs;
9476 if (id < 0)
9477 continue;
9478 attrs = CODING_ID_ATTRS (id);
9479 if (! NILP (highestp))
9480 return CODING_ATTR_BASE_NAME (attrs);
9481 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9483 return Fnreverse (val);
9486 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9488 static Lisp_Object
9489 make_subsidiaries (Lisp_Object base)
9491 Lisp_Object subsidiaries;
9492 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9493 char *buf = alloca (base_name_len + 6);
9494 int i;
9496 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9497 subsidiaries = Fmake_vector (make_number (3), Qnil);
9498 for (i = 0; i < 3; i++)
9500 strcpy (buf + base_name_len, suffixes[i]);
9501 ASET (subsidiaries, i, intern (buf));
9503 return subsidiaries;
9507 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9508 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9509 doc: /* For internal use only.
9510 usage: (define-coding-system-internal ...) */)
9511 (ptrdiff_t nargs, Lisp_Object *args)
9513 Lisp_Object name;
9514 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9515 Lisp_Object attrs; /* Vector of attributes. */
9516 Lisp_Object eol_type;
9517 Lisp_Object aliases;
9518 Lisp_Object coding_type, charset_list, safe_charsets;
9519 enum coding_category category;
9520 Lisp_Object tail, val;
9521 int max_charset_id = 0;
9522 int i;
9524 if (nargs < coding_arg_max)
9525 goto short_args;
9527 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9529 name = args[coding_arg_name];
9530 CHECK_SYMBOL (name);
9531 ASET (attrs, coding_attr_base_name, name);
9533 val = args[coding_arg_mnemonic];
9534 if (! STRINGP (val))
9535 CHECK_CHARACTER (val);
9536 ASET (attrs, coding_attr_mnemonic, val);
9538 coding_type = args[coding_arg_coding_type];
9539 CHECK_SYMBOL (coding_type);
9540 ASET (attrs, coding_attr_type, coding_type);
9542 charset_list = args[coding_arg_charset_list];
9543 if (SYMBOLP (charset_list))
9545 if (EQ (charset_list, Qiso_2022))
9547 if (! EQ (coding_type, Qiso_2022))
9548 error ("Invalid charset-list");
9549 charset_list = Viso_2022_charset_list;
9551 else if (EQ (charset_list, Qemacs_mule))
9553 if (! EQ (coding_type, Qemacs_mule))
9554 error ("Invalid charset-list");
9555 charset_list = Vemacs_mule_charset_list;
9557 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9559 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9560 error ("Invalid charset-list");
9561 if (max_charset_id < XFASTINT (XCAR (tail)))
9562 max_charset_id = XFASTINT (XCAR (tail));
9565 else
9567 charset_list = Fcopy_sequence (charset_list);
9568 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9570 struct charset *charset;
9572 val = XCAR (tail);
9573 CHECK_CHARSET_GET_CHARSET (val, charset);
9574 if (EQ (coding_type, Qiso_2022)
9575 ? CHARSET_ISO_FINAL (charset) < 0
9576 : EQ (coding_type, Qemacs_mule)
9577 ? CHARSET_EMACS_MULE_ID (charset) < 0
9578 : 0)
9579 error ("Can't handle charset `%s'",
9580 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9582 XSETCAR (tail, make_number (charset->id));
9583 if (max_charset_id < charset->id)
9584 max_charset_id = charset->id;
9587 ASET (attrs, coding_attr_charset_list, charset_list);
9589 safe_charsets = make_uninit_string (max_charset_id + 1);
9590 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9591 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9592 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9593 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9595 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9597 val = args[coding_arg_decode_translation_table];
9598 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9599 CHECK_SYMBOL (val);
9600 ASET (attrs, coding_attr_decode_tbl, val);
9602 val = args[coding_arg_encode_translation_table];
9603 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9604 CHECK_SYMBOL (val);
9605 ASET (attrs, coding_attr_encode_tbl, val);
9607 val = args[coding_arg_post_read_conversion];
9608 CHECK_SYMBOL (val);
9609 ASET (attrs, coding_attr_post_read, val);
9611 val = args[coding_arg_pre_write_conversion];
9612 CHECK_SYMBOL (val);
9613 ASET (attrs, coding_attr_pre_write, val);
9615 val = args[coding_arg_default_char];
9616 if (NILP (val))
9617 ASET (attrs, coding_attr_default_char, make_number (' '));
9618 else
9620 CHECK_CHARACTER (val);
9621 ASET (attrs, coding_attr_default_char, val);
9624 val = args[coding_arg_for_unibyte];
9625 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9627 val = args[coding_arg_plist];
9628 CHECK_LIST (val);
9629 ASET (attrs, coding_attr_plist, val);
9631 if (EQ (coding_type, Qcharset))
9633 /* Generate a lisp vector of 256 elements. Each element is nil,
9634 integer, or a list of charset IDs.
9636 If Nth element is nil, the byte code N is invalid in this
9637 coding system.
9639 If Nth element is a number NUM, N is the first byte of a
9640 charset whose ID is NUM.
9642 If Nth element is a list of charset IDs, N is the first byte
9643 of one of them. The list is sorted by dimensions of the
9644 charsets. A charset of smaller dimension comes first. */
9645 val = Fmake_vector (make_number (256), Qnil);
9647 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9649 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9650 int dim = CHARSET_DIMENSION (charset);
9651 int idx = (dim - 1) * 4;
9653 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9654 ASET (attrs, coding_attr_ascii_compat, Qt);
9656 for (i = charset->code_space[idx];
9657 i <= charset->code_space[idx + 1]; i++)
9659 Lisp_Object tmp, tmp2;
9660 int dim2;
9662 tmp = AREF (val, i);
9663 if (NILP (tmp))
9664 tmp = XCAR (tail);
9665 else if (NUMBERP (tmp))
9667 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9668 if (dim < dim2)
9669 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9670 else
9671 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9673 else
9675 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9677 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9678 if (dim < dim2)
9679 break;
9681 if (NILP (tmp2))
9682 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9683 else
9685 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9686 XSETCAR (tmp2, XCAR (tail));
9689 ASET (val, i, tmp);
9692 ASET (attrs, coding_attr_charset_valids, val);
9693 category = coding_category_charset;
9695 else if (EQ (coding_type, Qccl))
9697 Lisp_Object valids;
9699 if (nargs < coding_arg_ccl_max)
9700 goto short_args;
9702 val = args[coding_arg_ccl_decoder];
9703 CHECK_CCL_PROGRAM (val);
9704 if (VECTORP (val))
9705 val = Fcopy_sequence (val);
9706 ASET (attrs, coding_attr_ccl_decoder, val);
9708 val = args[coding_arg_ccl_encoder];
9709 CHECK_CCL_PROGRAM (val);
9710 if (VECTORP (val))
9711 val = Fcopy_sequence (val);
9712 ASET (attrs, coding_attr_ccl_encoder, val);
9714 val = args[coding_arg_ccl_valids];
9715 valids = Fmake_string (make_number (256), make_number (0));
9716 for (tail = val; CONSP (tail); tail = XCDR (tail))
9718 int from, to;
9720 val = XCAR (tail);
9721 if (INTEGERP (val))
9723 if (! (0 <= XINT (val) && XINT (val) <= 255))
9724 args_out_of_range_3 (val, make_number (0), make_number (255));
9725 from = to = XINT (val);
9727 else
9729 CHECK_CONS (val);
9730 CHECK_NATNUM_CAR (val);
9731 CHECK_NUMBER_CDR (val);
9732 if (XINT (XCAR (val)) > 255)
9733 args_out_of_range_3 (XCAR (val),
9734 make_number (0), make_number (255));
9735 from = XINT (XCAR (val));
9736 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9737 args_out_of_range_3 (XCDR (val),
9738 XCAR (val), make_number (255));
9739 to = XINT (XCDR (val));
9741 for (i = from; i <= to; i++)
9742 SSET (valids, i, 1);
9744 ASET (attrs, coding_attr_ccl_valids, valids);
9746 category = coding_category_ccl;
9748 else if (EQ (coding_type, Qutf_16))
9750 Lisp_Object bom, endian;
9752 ASET (attrs, coding_attr_ascii_compat, Qnil);
9754 if (nargs < coding_arg_utf16_max)
9755 goto short_args;
9757 bom = args[coding_arg_utf16_bom];
9758 if (! NILP (bom) && ! EQ (bom, Qt))
9760 CHECK_CONS (bom);
9761 val = XCAR (bom);
9762 CHECK_CODING_SYSTEM (val);
9763 val = XCDR (bom);
9764 CHECK_CODING_SYSTEM (val);
9766 ASET (attrs, coding_attr_utf_bom, bom);
9768 endian = args[coding_arg_utf16_endian];
9769 CHECK_SYMBOL (endian);
9770 if (NILP (endian))
9771 endian = Qbig;
9772 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9773 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9774 ASET (attrs, coding_attr_utf_16_endian, endian);
9776 category = (CONSP (bom)
9777 ? coding_category_utf_16_auto
9778 : NILP (bom)
9779 ? (EQ (endian, Qbig)
9780 ? coding_category_utf_16_be_nosig
9781 : coding_category_utf_16_le_nosig)
9782 : (EQ (endian, Qbig)
9783 ? coding_category_utf_16_be
9784 : coding_category_utf_16_le));
9786 else if (EQ (coding_type, Qiso_2022))
9788 Lisp_Object initial, reg_usage, request, flags;
9790 if (nargs < coding_arg_iso2022_max)
9791 goto short_args;
9793 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9794 CHECK_VECTOR (initial);
9795 for (i = 0; i < 4; i++)
9797 val = Faref (initial, make_number (i));
9798 if (! NILP (val))
9800 struct charset *charset;
9802 CHECK_CHARSET_GET_CHARSET (val, charset);
9803 ASET (initial, i, make_number (CHARSET_ID (charset)));
9804 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9805 ASET (attrs, coding_attr_ascii_compat, Qt);
9807 else
9808 ASET (initial, i, make_number (-1));
9811 reg_usage = args[coding_arg_iso2022_reg_usage];
9812 CHECK_CONS (reg_usage);
9813 CHECK_NUMBER_CAR (reg_usage);
9814 CHECK_NUMBER_CDR (reg_usage);
9816 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9817 for (tail = request; CONSP (tail); tail = XCDR (tail))
9819 int id;
9820 Lisp_Object tmp1;
9822 val = XCAR (tail);
9823 CHECK_CONS (val);
9824 tmp1 = XCAR (val);
9825 CHECK_CHARSET_GET_ID (tmp1, id);
9826 CHECK_NATNUM_CDR (val);
9827 if (XINT (XCDR (val)) >= 4)
9828 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9829 XSETCAR (val, make_number (id));
9832 flags = args[coding_arg_iso2022_flags];
9833 CHECK_NATNUM (flags);
9834 i = XINT (flags) & INT_MAX;
9835 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9836 i |= CODING_ISO_FLAG_FULL_SUPPORT;
9837 flags = make_number (i);
9839 ASET (attrs, coding_attr_iso_initial, initial);
9840 ASET (attrs, coding_attr_iso_usage, reg_usage);
9841 ASET (attrs, coding_attr_iso_request, request);
9842 ASET (attrs, coding_attr_iso_flags, flags);
9843 setup_iso_safe_charsets (attrs);
9845 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9846 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9847 | CODING_ISO_FLAG_SINGLE_SHIFT))
9848 ? coding_category_iso_7_else
9849 : EQ (args[coding_arg_charset_list], Qiso_2022)
9850 ? coding_category_iso_7
9851 : coding_category_iso_7_tight);
9852 else
9854 int id = XINT (AREF (initial, 1));
9856 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9857 || EQ (args[coding_arg_charset_list], Qiso_2022)
9858 || id < 0)
9859 ? coding_category_iso_8_else
9860 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9861 ? coding_category_iso_8_1
9862 : coding_category_iso_8_2);
9864 if (category != coding_category_iso_8_1
9865 && category != coding_category_iso_8_2)
9866 ASET (attrs, coding_attr_ascii_compat, Qnil);
9868 else if (EQ (coding_type, Qemacs_mule))
9870 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9871 ASET (attrs, coding_attr_emacs_mule_full, Qt);
9872 ASET (attrs, coding_attr_ascii_compat, Qt);
9873 category = coding_category_emacs_mule;
9875 else if (EQ (coding_type, Qshift_jis))
9878 struct charset *charset;
9880 if (XINT (Flength (charset_list)) != 3
9881 && XINT (Flength (charset_list)) != 4)
9882 error ("There should be three or four charsets");
9884 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9885 if (CHARSET_DIMENSION (charset) != 1)
9886 error ("Dimension of charset %s is not one",
9887 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9888 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9889 ASET (attrs, coding_attr_ascii_compat, Qt);
9891 charset_list = XCDR (charset_list);
9892 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9893 if (CHARSET_DIMENSION (charset) != 1)
9894 error ("Dimension of charset %s is not one",
9895 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9897 charset_list = XCDR (charset_list);
9898 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9899 if (CHARSET_DIMENSION (charset) != 2)
9900 error ("Dimension of charset %s is not two",
9901 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9903 charset_list = XCDR (charset_list);
9904 if (! NILP (charset_list))
9906 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9907 if (CHARSET_DIMENSION (charset) != 2)
9908 error ("Dimension of charset %s is not two",
9909 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9912 category = coding_category_sjis;
9913 Vsjis_coding_system = name;
9915 else if (EQ (coding_type, Qbig5))
9917 struct charset *charset;
9919 if (XINT (Flength (charset_list)) != 2)
9920 error ("There should be just two charsets");
9922 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9923 if (CHARSET_DIMENSION (charset) != 1)
9924 error ("Dimension of charset %s is not one",
9925 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9926 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9927 ASET (attrs, coding_attr_ascii_compat, Qt);
9929 charset_list = XCDR (charset_list);
9930 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9931 if (CHARSET_DIMENSION (charset) != 2)
9932 error ("Dimension of charset %s is not two",
9933 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9935 category = coding_category_big5;
9936 Vbig5_coding_system = name;
9938 else if (EQ (coding_type, Qraw_text))
9940 category = coding_category_raw_text;
9941 ASET (attrs, coding_attr_ascii_compat, Qt);
9943 else if (EQ (coding_type, Qutf_8))
9945 Lisp_Object bom;
9947 if (nargs < coding_arg_utf8_max)
9948 goto short_args;
9950 bom = args[coding_arg_utf8_bom];
9951 if (! NILP (bom) && ! EQ (bom, Qt))
9953 CHECK_CONS (bom);
9954 val = XCAR (bom);
9955 CHECK_CODING_SYSTEM (val);
9956 val = XCDR (bom);
9957 CHECK_CODING_SYSTEM (val);
9959 ASET (attrs, coding_attr_utf_bom, bom);
9960 if (NILP (bom))
9961 ASET (attrs, coding_attr_ascii_compat, Qt);
9963 category = (CONSP (bom) ? coding_category_utf_8_auto
9964 : NILP (bom) ? coding_category_utf_8_nosig
9965 : coding_category_utf_8_sig);
9967 else if (EQ (coding_type, Qundecided))
9968 category = coding_category_undecided;
9969 else
9970 error ("Invalid coding system type: %s",
9971 SDATA (SYMBOL_NAME (coding_type)));
9973 ASET (attrs, coding_attr_category, make_number (category));
9974 ASET (attrs, coding_attr_plist,
9975 Fcons (QCcategory,
9976 Fcons (AREF (Vcoding_category_table, category),
9977 CODING_ATTR_PLIST (attrs))));
9978 ASET (attrs, coding_attr_plist,
9979 Fcons (QCascii_compatible_p,
9980 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9981 CODING_ATTR_PLIST (attrs))));
9983 eol_type = args[coding_arg_eol_type];
9984 if (! NILP (eol_type)
9985 && ! EQ (eol_type, Qunix)
9986 && ! EQ (eol_type, Qdos)
9987 && ! EQ (eol_type, Qmac))
9988 error ("Invalid eol-type");
9990 aliases = Fcons (name, Qnil);
9992 if (NILP (eol_type))
9994 eol_type = make_subsidiaries (name);
9995 for (i = 0; i < 3; i++)
9997 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9999 this_name = AREF (eol_type, i);
10000 this_aliases = Fcons (this_name, Qnil);
10001 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10002 this_spec = Fmake_vector (make_number (3), attrs);
10003 ASET (this_spec, 1, this_aliases);
10004 ASET (this_spec, 2, this_eol_type);
10005 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10006 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10007 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10008 if (NILP (val))
10009 Vcoding_system_alist
10010 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10011 Vcoding_system_alist);
10015 spec_vec = Fmake_vector (make_number (3), attrs);
10016 ASET (spec_vec, 1, aliases);
10017 ASET (spec_vec, 2, eol_type);
10019 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10020 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10021 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10022 if (NILP (val))
10023 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10024 Vcoding_system_alist);
10027 int id = coding_categories[category].id;
10029 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10030 setup_coding_system (name, &coding_categories[category]);
10033 return Qnil;
10035 short_args:
10036 return Fsignal (Qwrong_number_of_arguments,
10037 Fcons (intern ("define-coding-system-internal"),
10038 make_number (nargs)));
10042 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10043 3, 3, 0,
10044 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10045 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10047 Lisp_Object spec, attrs;
10049 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10050 attrs = AREF (spec, 0);
10051 if (EQ (prop, QCmnemonic))
10053 if (! STRINGP (val))
10054 CHECK_CHARACTER (val);
10055 ASET (attrs, coding_attr_mnemonic, val);
10057 else if (EQ (prop, QCdefault_char))
10059 if (NILP (val))
10060 val = make_number (' ');
10061 else
10062 CHECK_CHARACTER (val);
10063 ASET (attrs, coding_attr_default_char, val);
10065 else if (EQ (prop, QCdecode_translation_table))
10067 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10068 CHECK_SYMBOL (val);
10069 ASET (attrs, coding_attr_decode_tbl, val);
10071 else if (EQ (prop, QCencode_translation_table))
10073 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10074 CHECK_SYMBOL (val);
10075 ASET (attrs, coding_attr_encode_tbl, val);
10077 else if (EQ (prop, QCpost_read_conversion))
10079 CHECK_SYMBOL (val);
10080 ASET (attrs, coding_attr_post_read, val);
10082 else if (EQ (prop, QCpre_write_conversion))
10084 CHECK_SYMBOL (val);
10085 ASET (attrs, coding_attr_pre_write, val);
10087 else if (EQ (prop, QCascii_compatible_p))
10089 ASET (attrs, coding_attr_ascii_compat, val);
10092 ASET (attrs, coding_attr_plist,
10093 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10094 return val;
10098 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10099 Sdefine_coding_system_alias, 2, 2, 0,
10100 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10101 (Lisp_Object alias, Lisp_Object coding_system)
10103 Lisp_Object spec, aliases, eol_type, val;
10105 CHECK_SYMBOL (alias);
10106 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10107 aliases = AREF (spec, 1);
10108 /* ALIASES should be a list of length more than zero, and the first
10109 element is a base coding system. Append ALIAS at the tail of the
10110 list. */
10111 while (!NILP (XCDR (aliases)))
10112 aliases = XCDR (aliases);
10113 XSETCDR (aliases, Fcons (alias, Qnil));
10115 eol_type = AREF (spec, 2);
10116 if (VECTORP (eol_type))
10118 Lisp_Object subsidiaries;
10119 int i;
10121 subsidiaries = make_subsidiaries (alias);
10122 for (i = 0; i < 3; i++)
10123 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10124 AREF (eol_type, i));
10127 Fputhash (alias, spec, Vcoding_system_hash_table);
10128 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10129 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10130 if (NILP (val))
10131 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10132 Vcoding_system_alist);
10134 return Qnil;
10137 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10138 1, 1, 0,
10139 doc: /* Return the base of CODING-SYSTEM.
10140 Any alias or subsidiary coding system is not a base coding system. */)
10141 (Lisp_Object coding_system)
10143 Lisp_Object spec, attrs;
10145 if (NILP (coding_system))
10146 return (Qno_conversion);
10147 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10148 attrs = AREF (spec, 0);
10149 return CODING_ATTR_BASE_NAME (attrs);
10152 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10153 1, 1, 0,
10154 doc: "Return the property list of CODING-SYSTEM.")
10155 (Lisp_Object coding_system)
10157 Lisp_Object spec, attrs;
10159 if (NILP (coding_system))
10160 coding_system = Qno_conversion;
10161 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10162 attrs = AREF (spec, 0);
10163 return CODING_ATTR_PLIST (attrs);
10167 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10168 1, 1, 0,
10169 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10170 (Lisp_Object coding_system)
10172 Lisp_Object spec;
10174 if (NILP (coding_system))
10175 coding_system = Qno_conversion;
10176 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10177 return AREF (spec, 1);
10180 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10181 Scoding_system_eol_type, 1, 1, 0,
10182 doc: /* Return eol-type of CODING-SYSTEM.
10183 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10185 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10186 and CR respectively.
10188 A vector value indicates that a format of end-of-line should be
10189 detected automatically. Nth element of the vector is the subsidiary
10190 coding system whose eol-type is N. */)
10191 (Lisp_Object coding_system)
10193 Lisp_Object spec, eol_type;
10194 int n;
10196 if (NILP (coding_system))
10197 coding_system = Qno_conversion;
10198 if (! CODING_SYSTEM_P (coding_system))
10199 return Qnil;
10200 spec = CODING_SYSTEM_SPEC (coding_system);
10201 eol_type = AREF (spec, 2);
10202 if (VECTORP (eol_type))
10203 return Fcopy_sequence (eol_type);
10204 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10205 return make_number (n);
10208 #endif /* emacs */
10211 /*** 9. Post-amble ***/
10213 void
10214 init_coding_once (void)
10216 int i;
10218 for (i = 0; i < coding_category_max; i++)
10220 coding_categories[i].id = -1;
10221 coding_priorities[i] = i;
10224 /* ISO2022 specific initialize routine. */
10225 for (i = 0; i < 0x20; i++)
10226 iso_code_class[i] = ISO_control_0;
10227 for (i = 0x21; i < 0x7F; i++)
10228 iso_code_class[i] = ISO_graphic_plane_0;
10229 for (i = 0x80; i < 0xA0; i++)
10230 iso_code_class[i] = ISO_control_1;
10231 for (i = 0xA1; i < 0xFF; i++)
10232 iso_code_class[i] = ISO_graphic_plane_1;
10233 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10234 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10235 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10236 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10237 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10238 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10239 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10240 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10241 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10243 for (i = 0; i < 256; i++)
10245 emacs_mule_bytes[i] = 1;
10247 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10248 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10249 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10250 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10253 #ifdef emacs
10255 void
10256 syms_of_coding (void)
10258 staticpro (&Vcoding_system_hash_table);
10260 Lisp_Object args[2];
10261 args[0] = QCtest;
10262 args[1] = Qeq;
10263 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10266 staticpro (&Vsjis_coding_system);
10267 Vsjis_coding_system = Qnil;
10269 staticpro (&Vbig5_coding_system);
10270 Vbig5_coding_system = Qnil;
10272 staticpro (&Vcode_conversion_reused_workbuf);
10273 Vcode_conversion_reused_workbuf = Qnil;
10275 staticpro (&Vcode_conversion_workbuf_name);
10276 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10278 reused_workbuf_in_use = 0;
10280 DEFSYM (Qcharset, "charset");
10281 DEFSYM (Qtarget_idx, "target-idx");
10282 DEFSYM (Qcoding_system_history, "coding-system-history");
10283 Fset (Qcoding_system_history, Qnil);
10285 /* Target FILENAME is the first argument. */
10286 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10287 /* Target FILENAME is the third argument. */
10288 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10290 DEFSYM (Qcall_process, "call-process");
10291 /* Target PROGRAM is the first argument. */
10292 Fput (Qcall_process, Qtarget_idx, make_number (0));
10294 DEFSYM (Qcall_process_region, "call-process-region");
10295 /* Target PROGRAM is the third argument. */
10296 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10298 DEFSYM (Qstart_process, "start-process");
10299 /* Target PROGRAM is the third argument. */
10300 Fput (Qstart_process, Qtarget_idx, make_number (2));
10302 DEFSYM (Qopen_network_stream, "open-network-stream");
10303 /* Target SERVICE is the fourth argument. */
10304 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10306 DEFSYM (Qcoding_system, "coding-system");
10307 DEFSYM (Qcoding_aliases, "coding-aliases");
10309 DEFSYM (Qeol_type, "eol-type");
10310 DEFSYM (Qunix, "unix");
10311 DEFSYM (Qdos, "dos");
10313 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10314 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10315 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10316 DEFSYM (Qdefault_char, "default-char");
10317 DEFSYM (Qundecided, "undecided");
10318 DEFSYM (Qno_conversion, "no-conversion");
10319 DEFSYM (Qraw_text, "raw-text");
10321 DEFSYM (Qiso_2022, "iso-2022");
10323 DEFSYM (Qutf_8, "utf-8");
10324 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10326 #if defined (WINDOWSNT) || defined (CYGWIN)
10327 /* No, not utf-16-le: that one has a BOM. */
10328 DEFSYM (Qutf_16le, "utf-16le");
10329 #endif
10331 DEFSYM (Qutf_16, "utf-16");
10332 DEFSYM (Qbig, "big");
10333 DEFSYM (Qlittle, "little");
10335 DEFSYM (Qshift_jis, "shift-jis");
10336 DEFSYM (Qbig5, "big5");
10338 DEFSYM (Qcoding_system_p, "coding-system-p");
10340 DEFSYM (Qcoding_system_error, "coding-system-error");
10341 Fput (Qcoding_system_error, Qerror_conditions,
10342 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10343 Fput (Qcoding_system_error, Qerror_message,
10344 build_pure_c_string ("Invalid coding system"));
10346 /* Intern this now in case it isn't already done.
10347 Setting this variable twice is harmless.
10348 But don't staticpro it here--that is done in alloc.c. */
10349 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10351 DEFSYM (Qtranslation_table, "translation-table");
10352 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10353 DEFSYM (Qtranslation_table_id, "translation-table-id");
10354 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10355 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10357 DEFSYM (Qvalid_codes, "valid-codes");
10359 DEFSYM (Qemacs_mule, "emacs-mule");
10361 DEFSYM (QCcategory, ":category");
10362 DEFSYM (QCmnemonic, ":mnemonic");
10363 DEFSYM (QCdefault_char, ":default-char");
10364 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10365 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10366 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10367 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10368 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10370 Vcoding_category_table
10371 = Fmake_vector (make_number (coding_category_max), Qnil);
10372 staticpro (&Vcoding_category_table);
10373 /* Followings are target of code detection. */
10374 ASET (Vcoding_category_table, coding_category_iso_7,
10375 intern_c_string ("coding-category-iso-7"));
10376 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10377 intern_c_string ("coding-category-iso-7-tight"));
10378 ASET (Vcoding_category_table, coding_category_iso_8_1,
10379 intern_c_string ("coding-category-iso-8-1"));
10380 ASET (Vcoding_category_table, coding_category_iso_8_2,
10381 intern_c_string ("coding-category-iso-8-2"));
10382 ASET (Vcoding_category_table, coding_category_iso_7_else,
10383 intern_c_string ("coding-category-iso-7-else"));
10384 ASET (Vcoding_category_table, coding_category_iso_8_else,
10385 intern_c_string ("coding-category-iso-8-else"));
10386 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10387 intern_c_string ("coding-category-utf-8-auto"));
10388 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10389 intern_c_string ("coding-category-utf-8"));
10390 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10391 intern_c_string ("coding-category-utf-8-sig"));
10392 ASET (Vcoding_category_table, coding_category_utf_16_be,
10393 intern_c_string ("coding-category-utf-16-be"));
10394 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10395 intern_c_string ("coding-category-utf-16-auto"));
10396 ASET (Vcoding_category_table, coding_category_utf_16_le,
10397 intern_c_string ("coding-category-utf-16-le"));
10398 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10399 intern_c_string ("coding-category-utf-16-be-nosig"));
10400 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10401 intern_c_string ("coding-category-utf-16-le-nosig"));
10402 ASET (Vcoding_category_table, coding_category_charset,
10403 intern_c_string ("coding-category-charset"));
10404 ASET (Vcoding_category_table, coding_category_sjis,
10405 intern_c_string ("coding-category-sjis"));
10406 ASET (Vcoding_category_table, coding_category_big5,
10407 intern_c_string ("coding-category-big5"));
10408 ASET (Vcoding_category_table, coding_category_ccl,
10409 intern_c_string ("coding-category-ccl"));
10410 ASET (Vcoding_category_table, coding_category_emacs_mule,
10411 intern_c_string ("coding-category-emacs-mule"));
10412 /* Followings are NOT target of code detection. */
10413 ASET (Vcoding_category_table, coding_category_raw_text,
10414 intern_c_string ("coding-category-raw-text"));
10415 ASET (Vcoding_category_table, coding_category_undecided,
10416 intern_c_string ("coding-category-undecided"));
10418 DEFSYM (Qinsufficient_source, "insufficient-source");
10419 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10420 DEFSYM (Qinvalid_source, "invalid-source");
10421 DEFSYM (Qinterrupted, "interrupted");
10422 DEFSYM (Qinsufficient_memory, "insufficient-memory");
10423 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10425 defsubr (&Scoding_system_p);
10426 defsubr (&Sread_coding_system);
10427 defsubr (&Sread_non_nil_coding_system);
10428 defsubr (&Scheck_coding_system);
10429 defsubr (&Sdetect_coding_region);
10430 defsubr (&Sdetect_coding_string);
10431 defsubr (&Sfind_coding_systems_region_internal);
10432 defsubr (&Sunencodable_char_position);
10433 defsubr (&Scheck_coding_systems_region);
10434 defsubr (&Sdecode_coding_region);
10435 defsubr (&Sencode_coding_region);
10436 defsubr (&Sdecode_coding_string);
10437 defsubr (&Sencode_coding_string);
10438 defsubr (&Sdecode_sjis_char);
10439 defsubr (&Sencode_sjis_char);
10440 defsubr (&Sdecode_big5_char);
10441 defsubr (&Sencode_big5_char);
10442 defsubr (&Sset_terminal_coding_system_internal);
10443 defsubr (&Sset_safe_terminal_coding_system_internal);
10444 defsubr (&Sterminal_coding_system);
10445 defsubr (&Sset_keyboard_coding_system_internal);
10446 defsubr (&Skeyboard_coding_system);
10447 defsubr (&Sfind_operation_coding_system);
10448 defsubr (&Sset_coding_system_priority);
10449 defsubr (&Sdefine_coding_system_internal);
10450 defsubr (&Sdefine_coding_system_alias);
10451 defsubr (&Scoding_system_put);
10452 defsubr (&Scoding_system_base);
10453 defsubr (&Scoding_system_plist);
10454 defsubr (&Scoding_system_aliases);
10455 defsubr (&Scoding_system_eol_type);
10456 defsubr (&Scoding_system_priority_list);
10458 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10459 doc: /* List of coding systems.
10461 Do not alter the value of this variable manually. This variable should be
10462 updated by the functions `define-coding-system' and
10463 `define-coding-system-alias'. */);
10464 Vcoding_system_list = Qnil;
10466 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10467 doc: /* Alist of coding system names.
10468 Each element is one element list of coding system name.
10469 This variable is given to `completing-read' as COLLECTION argument.
10471 Do not alter the value of this variable manually. This variable should be
10472 updated by the functions `make-coding-system' and
10473 `define-coding-system-alias'. */);
10474 Vcoding_system_alist = Qnil;
10476 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10477 doc: /* List of coding-categories (symbols) ordered by priority.
10479 On detecting a coding system, Emacs tries code detection algorithms
10480 associated with each coding-category one by one in this order. When
10481 one algorithm agrees with a byte sequence of source text, the coding
10482 system bound to the corresponding coding-category is selected.
10484 Don't modify this variable directly, but use `set-coding-system-priority'. */);
10486 int i;
10488 Vcoding_category_list = Qnil;
10489 for (i = coding_category_max - 1; i >= 0; i--)
10490 Vcoding_category_list
10491 = Fcons (AREF (Vcoding_category_table, i),
10492 Vcoding_category_list);
10495 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10496 doc: /* Specify the coding system for read operations.
10497 It is useful to bind this variable with `let', but do not set it globally.
10498 If the value is a coding system, it is used for decoding on read operation.
10499 If not, an appropriate element is used from one of the coding system alists.
10500 There are three such tables: `file-coding-system-alist',
10501 `process-coding-system-alist', and `network-coding-system-alist'. */);
10502 Vcoding_system_for_read = Qnil;
10504 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10505 doc: /* Specify the coding system for write operations.
10506 Programs bind this variable with `let', but you should not set it globally.
10507 If the value is a coding system, it is used for encoding of output,
10508 when writing it to a file and when sending it to a file or subprocess.
10510 If this does not specify a coding system, an appropriate element
10511 is used from one of the coding system alists.
10512 There are three such tables: `file-coding-system-alist',
10513 `process-coding-system-alist', and `network-coding-system-alist'.
10514 For output to files, if the above procedure does not specify a coding system,
10515 the value of `buffer-file-coding-system' is used. */);
10516 Vcoding_system_for_write = Qnil;
10518 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10519 doc: /*
10520 Coding system used in the latest file or process I/O. */);
10521 Vlast_coding_system_used = Qnil;
10523 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10524 doc: /*
10525 Error status of the last code conversion.
10527 When an error was detected in the last code conversion, this variable
10528 is set to one of the following symbols.
10529 `insufficient-source'
10530 `inconsistent-eol'
10531 `invalid-source'
10532 `interrupted'
10533 `insufficient-memory'
10534 When no error was detected, the value doesn't change. So, to check
10535 the error status of a code conversion by this variable, you must
10536 explicitly set this variable to nil before performing code
10537 conversion. */);
10538 Vlast_code_conversion_error = Qnil;
10540 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10541 doc: /*
10542 *Non-nil means always inhibit code conversion of end-of-line format.
10543 See info node `Coding Systems' and info node `Text and Binary' concerning
10544 such conversion. */);
10545 inhibit_eol_conversion = 0;
10547 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10548 doc: /*
10549 Non-nil means process buffer inherits coding system of process output.
10550 Bind it to t if the process output is to be treated as if it were a file
10551 read from some filesystem. */);
10552 inherit_process_coding_system = 0;
10554 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10555 doc: /*
10556 Alist to decide a coding system to use for a file I/O operation.
10557 The format is ((PATTERN . VAL) ...),
10558 where PATTERN is a regular expression matching a file name,
10559 VAL is a coding system, a cons of coding systems, or a function symbol.
10560 If VAL is a coding system, it is used for both decoding and encoding
10561 the file contents.
10562 If VAL is a cons of coding systems, the car part is used for decoding,
10563 and the cdr part is used for encoding.
10564 If VAL is a function symbol, the function must return a coding system
10565 or a cons of coding systems which are used as above. The function is
10566 called with an argument that is a list of the arguments with which
10567 `find-operation-coding-system' was called. If the function can't decide
10568 a coding system, it can return `undecided' so that the normal
10569 code-detection is performed.
10571 See also the function `find-operation-coding-system'
10572 and the variable `auto-coding-alist'. */);
10573 Vfile_coding_system_alist = Qnil;
10575 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10576 doc: /*
10577 Alist to decide a coding system to use for a process I/O operation.
10578 The format is ((PATTERN . VAL) ...),
10579 where PATTERN is a regular expression matching a program name,
10580 VAL is a coding system, a cons of coding systems, or a function symbol.
10581 If VAL is a coding system, it is used for both decoding what received
10582 from the program and encoding what sent to the program.
10583 If VAL is a cons of coding systems, the car part is used for decoding,
10584 and the cdr part is used for encoding.
10585 If VAL is a function symbol, the function must return a coding system
10586 or a cons of coding systems which are used as above.
10588 See also the function `find-operation-coding-system'. */);
10589 Vprocess_coding_system_alist = Qnil;
10591 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10592 doc: /*
10593 Alist to decide a coding system to use for a network I/O operation.
10594 The format is ((PATTERN . VAL) ...),
10595 where PATTERN is a regular expression matching a network service name
10596 or is a port number to connect to,
10597 VAL is a coding system, a cons of coding systems, or a function symbol.
10598 If VAL is a coding system, it is used for both decoding what received
10599 from the network stream and encoding what sent to the network stream.
10600 If VAL is a cons of coding systems, the car part is used for decoding,
10601 and the cdr part is used for encoding.
10602 If VAL is a function symbol, the function must return a coding system
10603 or a cons of coding systems which are used as above.
10605 See also the function `find-operation-coding-system'. */);
10606 Vnetwork_coding_system_alist = Qnil;
10608 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10609 doc: /* Coding system to use with system messages.
10610 Also used for decoding keyboard input on X Window system. */);
10611 Vlocale_coding_system = Qnil;
10613 /* The eol mnemonics are reset in startup.el system-dependently. */
10614 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10615 doc: /*
10616 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10617 eol_mnemonic_unix = build_pure_c_string (":");
10619 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10620 doc: /*
10621 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10622 eol_mnemonic_dos = build_pure_c_string ("\\");
10624 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10625 doc: /*
10626 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10627 eol_mnemonic_mac = build_pure_c_string ("/");
10629 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10630 doc: /*
10631 *String displayed in mode line when end-of-line format is not yet determined. */);
10632 eol_mnemonic_undecided = build_pure_c_string (":");
10634 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10635 doc: /*
10636 *Non-nil enables character translation while encoding and decoding. */);
10637 Venable_character_translation = Qt;
10639 DEFVAR_LISP ("standard-translation-table-for-decode",
10640 Vstandard_translation_table_for_decode,
10641 doc: /* Table for translating characters while decoding. */);
10642 Vstandard_translation_table_for_decode = Qnil;
10644 DEFVAR_LISP ("standard-translation-table-for-encode",
10645 Vstandard_translation_table_for_encode,
10646 doc: /* Table for translating characters while encoding. */);
10647 Vstandard_translation_table_for_encode = Qnil;
10649 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10650 doc: /* Alist of charsets vs revision numbers.
10651 While encoding, if a charset (car part of an element) is found,
10652 designate it with the escape sequence identifying revision (cdr part
10653 of the element). */);
10654 Vcharset_revision_table = Qnil;
10656 DEFVAR_LISP ("default-process-coding-system",
10657 Vdefault_process_coding_system,
10658 doc: /* Cons of coding systems used for process I/O by default.
10659 The car part is used for decoding a process output,
10660 the cdr part is used for encoding a text to be sent to a process. */);
10661 Vdefault_process_coding_system = Qnil;
10663 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10664 doc: /*
10665 Table of extra Latin codes in the range 128..159 (inclusive).
10666 This is a vector of length 256.
10667 If Nth element is non-nil, the existence of code N in a file
10668 \(or output of subprocess) doesn't prevent it to be detected as
10669 a coding system of ISO 2022 variant which has a flag
10670 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10671 or reading output of a subprocess.
10672 Only 128th through 159th elements have a meaning. */);
10673 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10675 DEFVAR_LISP ("select-safe-coding-system-function",
10676 Vselect_safe_coding_system_function,
10677 doc: /*
10678 Function to call to select safe coding system for encoding a text.
10680 If set, this function is called to force a user to select a proper
10681 coding system which can encode the text in the case that a default
10682 coding system used in each operation can't encode the text. The
10683 function should take care that the buffer is not modified while
10684 the coding system is being selected.
10686 The default value is `select-safe-coding-system' (which see). */);
10687 Vselect_safe_coding_system_function = Qnil;
10689 DEFVAR_BOOL ("coding-system-require-warning",
10690 coding_system_require_warning,
10691 doc: /* Internal use only.
10692 If non-nil, on writing a file, `select-safe-coding-system-function' is
10693 called even if `coding-system-for-write' is non-nil. The command
10694 `universal-coding-system-argument' binds this variable to t temporarily. */);
10695 coding_system_require_warning = 0;
10698 DEFVAR_BOOL ("inhibit-iso-escape-detection",
10699 inhibit_iso_escape_detection,
10700 doc: /*
10701 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10703 When Emacs reads text, it tries to detect how the text is encoded.
10704 This code detection is sensitive to escape sequences. If Emacs sees
10705 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10706 of the ISO2022 encodings, and decodes text by the corresponding coding
10707 system (e.g. `iso-2022-7bit').
10709 However, there may be a case that you want to read escape sequences in
10710 a file as is. In such a case, you can set this variable to non-nil.
10711 Then the code detection will ignore any escape sequences, and no text is
10712 detected as encoded in some ISO-2022 encoding. The result is that all
10713 escape sequences become visible in a buffer.
10715 The default value is nil, and it is strongly recommended not to change
10716 it. That is because many Emacs Lisp source files that contain
10717 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10718 in Emacs's distribution, and they won't be decoded correctly on
10719 reading if you suppress escape sequence detection.
10721 The other way to read escape sequences in a file without decoding is
10722 to explicitly specify some coding system that doesn't use ISO-2022
10723 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
10724 inhibit_iso_escape_detection = 0;
10726 DEFVAR_BOOL ("inhibit-null-byte-detection",
10727 inhibit_null_byte_detection,
10728 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10729 By default, Emacs treats it as binary data, and does not attempt to
10730 decode it. The effect is as if you specified `no-conversion' for
10731 reading that text.
10733 Set this to non-nil when a regular text happens to include null bytes.
10734 Examples are Index nodes of Info files and null-byte delimited output
10735 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10736 decode text as usual. */);
10737 inhibit_null_byte_detection = 0;
10739 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10740 doc: /* Char table for translating self-inserting characters.
10741 This is applied to the result of input methods, not their input.
10742 See also `keyboard-translate-table'.
10744 Use of this variable for character code unification was rendered
10745 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10746 internal character representation. */);
10747 Vtranslation_table_for_input = Qnil;
10750 Lisp_Object args[coding_arg_max];
10751 Lisp_Object plist[16];
10752 int i;
10754 for (i = 0; i < coding_arg_max; i++)
10755 args[i] = Qnil;
10757 plist[0] = intern_c_string (":name");
10758 plist[1] = args[coding_arg_name] = Qno_conversion;
10759 plist[2] = intern_c_string (":mnemonic");
10760 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10761 plist[4] = intern_c_string (":coding-type");
10762 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10763 plist[6] = intern_c_string (":ascii-compatible-p");
10764 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10765 plist[8] = intern_c_string (":default-char");
10766 plist[9] = args[coding_arg_default_char] = make_number (0);
10767 plist[10] = intern_c_string (":for-unibyte");
10768 plist[11] = args[coding_arg_for_unibyte] = Qt;
10769 plist[12] = intern_c_string (":docstring");
10770 plist[13] = build_pure_c_string ("Do no conversion.\n\
10772 When you visit a file with this coding, the file is read into a\n\
10773 unibyte buffer as is, thus each byte of a file is treated as a\n\
10774 character.");
10775 plist[14] = intern_c_string (":eol-type");
10776 plist[15] = args[coding_arg_eol_type] = Qunix;
10777 args[coding_arg_plist] = Flist (16, plist);
10778 Fdefine_coding_system_internal (coding_arg_max, args);
10780 plist[1] = args[coding_arg_name] = Qundecided;
10781 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10782 plist[5] = args[coding_arg_coding_type] = Qundecided;
10783 /* This is already set.
10784 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10785 plist[8] = intern_c_string (":charset-list");
10786 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10787 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10788 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10789 plist[15] = args[coding_arg_eol_type] = Qnil;
10790 args[coding_arg_plist] = Flist (16, plist);
10791 Fdefine_coding_system_internal (coding_arg_max, args);
10794 setup_coding_system (Qno_conversion, &safe_terminal_coding);
10797 int i;
10799 for (i = 0; i < coding_category_max; i++)
10800 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10802 #if defined (DOS_NT)
10803 system_eol_type = Qdos;
10804 #else
10805 system_eol_type = Qunix;
10806 #endif
10807 staticpro (&system_eol_type);
10810 char *
10811 emacs_strerror (int error_number)
10813 char *str;
10815 synchronize_system_messages_locale ();
10816 str = strerror (error_number);
10818 if (! NILP (Vlocale_coding_system))
10820 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10821 Vlocale_coding_system,
10823 str = SSDATA (dec);
10826 return str;
10829 #endif /* emacs */