1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2013 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
150 Return true if the byte sequence conforms to XXX.
152 Below is the template of these functions. */
156 detect_coding_XXX (struct coding_system
*coding
,
157 struct coding_detection_info
*detect_info
)
159 const unsigned char *src
= coding
->source
;
160 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
161 bool multibytep
= coding
->src_multibyte
;
162 ptrdiff_t consumed_chars
= 0;
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
172 if (! __C_conforms_to_XXX___ (c
))
174 if (! __C_strongly_suggests_XXX__ (c
))
175 found
= CATEGORY_MASK_XXX
;
177 /* The byte sequence is invalid for XXX. */
178 detect_info
->rejected
|= CATEGORY_MASK_XXX
;
182 /* The source exhausted successfully. */
183 detect_info
->found
|= found
;
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
200 Below is the template of these functions. */
204 decode_coding_XXXX (struct coding_system
*coding
)
206 const unsigned char *src
= coding
->source
+ coding
->consumed
;
207 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base
;
212 /* A buffer to produce decoded characters. */
213 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
214 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
;
215 bool multibytep
= coding
->src_multibyte
;
220 if (charbuf
< charbuf_end
)
221 /* No more room to produce a decoded character. */
228 if (src_base
< src_end
229 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base
< src_end
&& charbuf
< charbuf_end
)
233 *charbuf
++ = *src_base
++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
237 /* Remember how many characters we produced. */
238 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
258 Below is a template of these functions. */
261 encode_coding_XXX (struct coding_system
*coding
)
263 bool multibytep
= coding
->dst_multibyte
;
264 int *charbuf
= coding
->charbuf
;
265 int *charbuf_end
= charbuf
->charbuf
+ coding
->charbuf_used
;
266 unsigned char *dst
= coding
->destination
+ coding
->produced
;
267 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
268 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
269 ptrdiff_t produced_chars
= 0;
271 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
274 /* Encode C into DST, and increment DST. */
276 label_no_more_destination
:
277 /* How many chars and bytes we produced. */
278 coding
->produced_char
+= produced_chars
;
279 coding
->produced
= dst
- coding
->destination
;
284 /*** 1. Preamble ***/
290 #include "character.h"
294 #include "composite.h"
298 #include "termhooks.h"
300 Lisp_Object Vcoding_system_hash_table
;
302 static Lisp_Object Qcoding_system
, Qeol_type
;
303 static Lisp_Object Qcoding_aliases
;
304 Lisp_Object Qunix
, Qdos
;
305 Lisp_Object Qbuffer_file_coding_system
;
306 static Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
307 static Lisp_Object Qdefault_char
;
308 Lisp_Object Qno_conversion
, Qundecided
;
309 Lisp_Object Qcharset
, Qutf_8
;
310 static Lisp_Object Qiso_2022
;
311 static Lisp_Object Qutf_16
, Qshift_jis
, Qbig5
;
312 static Lisp_Object Qbig
, Qlittle
;
313 static Lisp_Object Qcoding_system_history
;
314 static Lisp_Object Qvalid_codes
;
315 static Lisp_Object QCcategory
, QCmnemonic
, QCdefault_char
;
316 static Lisp_Object QCdecode_translation_table
, QCencode_translation_table
;
317 static Lisp_Object QCpost_read_conversion
, QCpre_write_conversion
;
318 static Lisp_Object QCascii_compatible_p
;
320 Lisp_Object Qcall_process
, Qcall_process_region
;
321 Lisp_Object Qstart_process
, Qopen_network_stream
;
322 static Lisp_Object Qtarget_idx
;
324 static Lisp_Object Qinsufficient_source
, Qinconsistent_eol
, Qinvalid_source
;
325 static Lisp_Object Qinterrupted
, Qinsufficient_memory
;
327 /* If a symbol has this property, evaluate the value to define the
328 symbol as a coding system. */
329 static Lisp_Object Qcoding_system_define_form
;
331 /* Format of end-of-line decided by system. This is Qunix on
332 Unix and Mac, Qdos on DOS/Windows.
333 This has an effect only for external encoding (i.e. for output to
334 file and process), not for in-buffer or Lisp string encoding. */
335 static Lisp_Object system_eol_type
;
339 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
341 /* Coding system emacs-mule and raw-text are for converting only
342 end-of-line format. */
343 Lisp_Object Qemacs_mule
, Qraw_text
;
344 Lisp_Object Qutf_8_emacs
;
346 #if defined (WINDOWSNT) || defined (CYGWIN)
347 static Lisp_Object Qutf_16le
;
350 /* Coding-systems are handed between Emacs Lisp programs and C internal
351 routines by the following three variables. */
352 /* Coding system to be used to encode text for terminal display when
353 terminal coding system is nil. */
354 struct coding_system safe_terminal_coding
;
358 Lisp_Object Qtranslation_table
;
359 Lisp_Object Qtranslation_table_id
;
360 static Lisp_Object Qtranslation_table_for_decode
;
361 static Lisp_Object Qtranslation_table_for_encode
;
363 /* Two special coding systems. */
364 static Lisp_Object Vsjis_coding_system
;
365 static Lisp_Object Vbig5_coding_system
;
367 /* ISO2022 section */
369 #define CODING_ISO_INITIAL(coding, reg) \
370 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
371 coding_attr_iso_initial), \
375 #define CODING_ISO_REQUEST(coding, charset_id) \
376 (((charset_id) <= (coding)->max_charset_id \
377 ? ((coding)->safe_charsets[charset_id] != 255 \
378 ? (coding)->safe_charsets[charset_id] \
383 #define CODING_ISO_FLAGS(coding) \
384 ((coding)->spec.iso_2022.flags)
385 #define CODING_ISO_DESIGNATION(coding, reg) \
386 ((coding)->spec.iso_2022.current_designation[reg])
387 #define CODING_ISO_INVOCATION(coding, plane) \
388 ((coding)->spec.iso_2022.current_invocation[plane])
389 #define CODING_ISO_SINGLE_SHIFTING(coding) \
390 ((coding)->spec.iso_2022.single_shifting)
391 #define CODING_ISO_BOL(coding) \
392 ((coding)->spec.iso_2022.bol)
393 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
394 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
395 #define CODING_ISO_CMP_STATUS(coding) \
396 (&(coding)->spec.iso_2022.cmp_status)
397 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
398 ((coding)->spec.iso_2022.ctext_extended_segment_len)
399 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
400 ((coding)->spec.iso_2022.embedded_utf_8)
402 /* Control characters of ISO2022. */
403 /* code */ /* function */
404 #define ISO_CODE_SO 0x0E /* shift-out */
405 #define ISO_CODE_SI 0x0F /* shift-in */
406 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
407 #define ISO_CODE_ESC 0x1B /* escape */
408 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
409 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
410 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
412 /* All code (1-byte) of ISO2022 is classified into one of the
414 enum iso_code_class_type
416 ISO_control_0
, /* Control codes in the range
417 0x00..0x1F and 0x7F, except for the
418 following 5 codes. */
419 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
420 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
421 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
422 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
423 ISO_control_1
, /* Control codes in the range
424 0x80..0x9F, except for the
425 following 3 codes. */
426 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
427 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
428 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
429 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
430 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
431 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
432 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
435 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
436 `iso-flags' attribute of an iso2022 coding system. */
438 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
439 instead of the correct short-form sequence (e.g. ESC $ A). */
440 #define CODING_ISO_FLAG_LONG_FORM 0x0001
442 /* If set, reset graphic planes and registers at end-of-line to the
444 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
446 /* If set, reset graphic planes and registers before any control
447 characters to the initial state. */
448 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
450 /* If set, encode by 7-bit environment. */
451 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
453 /* If set, use locking-shift function. */
454 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
456 /* If set, use single-shift function. Overwrite
457 CODING_ISO_FLAG_LOCKING_SHIFT. */
458 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
460 /* If set, use designation escape sequence. */
461 #define CODING_ISO_FLAG_DESIGNATION 0x0040
463 /* If set, produce revision number sequence. */
464 #define CODING_ISO_FLAG_REVISION 0x0080
466 /* If set, produce ISO6429's direction specifying sequence. */
467 #define CODING_ISO_FLAG_DIRECTION 0x0100
469 /* If set, assume designation states are reset at beginning of line on
471 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
473 /* If set, designation sequence should be placed at beginning of line
475 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
477 /* If set, do not encode unsafe characters on output. */
478 #define CODING_ISO_FLAG_SAFE 0x0800
480 /* If set, extra latin codes (128..159) are accepted as a valid code
482 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
484 #define CODING_ISO_FLAG_COMPOSITION 0x2000
486 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
488 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
490 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
492 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
494 /* A character to be produced on output if encoding of the original
495 character is prohibited by CODING_ISO_FLAG_SAFE. */
496 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
499 #define CODING_UTF_8_BOM(coding) \
500 ((coding)->spec.utf_8_bom)
503 #define CODING_UTF_16_BOM(coding) \
504 ((coding)->spec.utf_16.bom)
506 #define CODING_UTF_16_ENDIAN(coding) \
507 ((coding)->spec.utf_16.endian)
509 #define CODING_UTF_16_SURROGATE(coding) \
510 ((coding)->spec.utf_16.surrogate)
514 #define CODING_CCL_DECODER(coding) \
515 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
516 #define CODING_CCL_ENCODER(coding) \
517 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
518 #define CODING_CCL_VALIDS(coding) \
519 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
521 /* Index for each coding category in `coding_categories' */
525 coding_category_iso_7
,
526 coding_category_iso_7_tight
,
527 coding_category_iso_8_1
,
528 coding_category_iso_8_2
,
529 coding_category_iso_7_else
,
530 coding_category_iso_8_else
,
531 coding_category_utf_8_auto
,
532 coding_category_utf_8_nosig
,
533 coding_category_utf_8_sig
,
534 coding_category_utf_16_auto
,
535 coding_category_utf_16_be
,
536 coding_category_utf_16_le
,
537 coding_category_utf_16_be_nosig
,
538 coding_category_utf_16_le_nosig
,
539 coding_category_charset
,
540 coding_category_sjis
,
541 coding_category_big5
,
543 coding_category_emacs_mule
,
544 /* All above are targets of code detection. */
545 coding_category_raw_text
,
546 coding_category_undecided
,
550 /* Definitions of flag bits used in detect_coding_XXXX. */
551 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
552 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
553 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
554 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
555 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
556 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
557 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
558 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
559 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
560 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
561 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
562 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
563 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
564 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
565 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
566 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
567 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
568 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
569 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
570 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
572 /* This value is returned if detect_coding_mask () find nothing other
573 than ASCII characters. */
574 #define CATEGORY_MASK_ANY \
575 (CATEGORY_MASK_ISO_7 \
576 | CATEGORY_MASK_ISO_7_TIGHT \
577 | CATEGORY_MASK_ISO_8_1 \
578 | CATEGORY_MASK_ISO_8_2 \
579 | CATEGORY_MASK_ISO_7_ELSE \
580 | CATEGORY_MASK_ISO_8_ELSE \
581 | CATEGORY_MASK_UTF_8_AUTO \
582 | CATEGORY_MASK_UTF_8_NOSIG \
583 | CATEGORY_MASK_UTF_8_SIG \
584 | CATEGORY_MASK_UTF_16_AUTO \
585 | CATEGORY_MASK_UTF_16_BE \
586 | CATEGORY_MASK_UTF_16_LE \
587 | CATEGORY_MASK_UTF_16_BE_NOSIG \
588 | CATEGORY_MASK_UTF_16_LE_NOSIG \
589 | CATEGORY_MASK_CHARSET \
590 | CATEGORY_MASK_SJIS \
591 | CATEGORY_MASK_BIG5 \
592 | CATEGORY_MASK_CCL \
593 | CATEGORY_MASK_EMACS_MULE)
596 #define CATEGORY_MASK_ISO_7BIT \
597 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
599 #define CATEGORY_MASK_ISO_8BIT \
600 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
602 #define CATEGORY_MASK_ISO_ELSE \
603 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
605 #define CATEGORY_MASK_ISO_ESCAPE \
606 (CATEGORY_MASK_ISO_7 \
607 | CATEGORY_MASK_ISO_7_TIGHT \
608 | CATEGORY_MASK_ISO_7_ELSE \
609 | CATEGORY_MASK_ISO_8_ELSE)
611 #define CATEGORY_MASK_ISO \
612 ( CATEGORY_MASK_ISO_7BIT \
613 | CATEGORY_MASK_ISO_8BIT \
614 | CATEGORY_MASK_ISO_ELSE)
616 #define CATEGORY_MASK_UTF_16 \
617 (CATEGORY_MASK_UTF_16_AUTO \
618 | CATEGORY_MASK_UTF_16_BE \
619 | CATEGORY_MASK_UTF_16_LE \
620 | CATEGORY_MASK_UTF_16_BE_NOSIG \
621 | CATEGORY_MASK_UTF_16_LE_NOSIG)
623 #define CATEGORY_MASK_UTF_8 \
624 (CATEGORY_MASK_UTF_8_AUTO \
625 | CATEGORY_MASK_UTF_8_NOSIG \
626 | CATEGORY_MASK_UTF_8_SIG)
628 /* Table of coding categories (Lisp symbols). This variable is for
629 internal use only. */
630 static Lisp_Object Vcoding_category_table
;
632 /* Table of coding-categories ordered by priority. */
633 static enum coding_category coding_priorities
[coding_category_max
];
635 /* Nth element is a coding context for the coding system bound to the
636 Nth coding category. */
637 static struct coding_system coding_categories
[coding_category_max
];
639 /*** Commonly used macros and functions ***/
642 #define min(a, b) ((a) < (b) ? (a) : (b))
645 #define max(a, b) ((a) > (b) ? (a) : (b))
648 #define CODING_GET_INFO(coding, attrs, charset_list) \
650 (attrs) = CODING_ID_ATTRS ((coding)->id); \
651 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
655 /* Safely get one byte from the source text pointed by SRC which ends
656 at SRC_END, and set C to that byte. If there are not enough bytes
657 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
658 and a multibyte character is found at SRC, set C to the
659 negative value of the character code. The caller should declare
660 and set these variables appropriately in advance:
661 src, src_end, multibytep */
663 #define ONE_MORE_BYTE(c) \
665 if (src == src_end) \
667 if (src_base < src) \
668 record_conversion_result \
669 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
670 goto no_more_source; \
673 if (multibytep && (c & 0x80)) \
675 if ((c & 0xFE) == 0xC0) \
676 c = ((c & 1) << 6) | *src++; \
680 c = - string_char (src, &src, NULL); \
681 record_conversion_result \
682 (coding, CODING_RESULT_INVALID_SRC); \
688 /* Safely get two bytes from the source text pointed by SRC which ends
689 at SRC_END, and set C1 and C2 to those bytes while skipping the
690 heading multibyte characters. If there are not enough bytes in the
691 source, it jumps to 'no_more_source'. If MULTIBYTEP and
692 a multibyte character is found for C2, set C2 to the negative value
693 of the character code. The caller should declare and set these
694 variables appropriately in advance:
695 src, src_end, multibytep
696 It is intended that this macro is used in detect_coding_utf_16. */
698 #define TWO_MORE_BYTES(c1, c2) \
701 if (src == src_end) \
702 goto no_more_source; \
704 if (multibytep && (c1 & 0x80)) \
706 if ((c1 & 0xFE) == 0xC0) \
707 c1 = ((c1 & 1) << 6) | *src++; \
710 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
715 if (src == src_end) \
716 goto no_more_source; \
718 if (multibytep && (c2 & 0x80)) \
720 if ((c2 & 0xFE) == 0xC0) \
721 c2 = ((c2 & 1) << 6) | *src++; \
728 /* Store a byte C in the place pointed by DST and increment DST to the
729 next free point, and increment PRODUCED_CHARS. The caller should
730 assure that C is 0..127, and declare and set the variable `dst'
731 appropriately in advance.
735 #define EMIT_ONE_ASCII_BYTE(c) \
742 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
744 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
746 produced_chars += 2; \
747 *dst++ = (c1), *dst++ = (c2); \
751 /* Store a byte C in the place pointed by DST and increment DST to the
752 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
753 store in an appropriate multibyte form. The caller should
754 declare and set the variables `dst' and `multibytep' appropriately
757 #define EMIT_ONE_BYTE(c) \
764 ch = BYTE8_TO_CHAR (ch); \
765 CHAR_STRING_ADVANCE (ch, dst); \
772 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
774 #define EMIT_TWO_BYTES(c1, c2) \
776 produced_chars += 2; \
783 ch = BYTE8_TO_CHAR (ch); \
784 CHAR_STRING_ADVANCE (ch, dst); \
787 ch = BYTE8_TO_CHAR (ch); \
788 CHAR_STRING_ADVANCE (ch, dst); \
798 #define EMIT_THREE_BYTES(c1, c2, c3) \
800 EMIT_ONE_BYTE (c1); \
801 EMIT_TWO_BYTES (c2, c3); \
805 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
807 EMIT_TWO_BYTES (c1, c2); \
808 EMIT_TWO_BYTES (c3, c4); \
813 record_conversion_result (struct coding_system
*coding
,
814 enum coding_result_code result
)
816 coding
->result
= result
;
819 case CODING_RESULT_INSUFFICIENT_SRC
:
820 Vlast_code_conversion_error
= Qinsufficient_source
;
822 case CODING_RESULT_INCONSISTENT_EOL
:
823 Vlast_code_conversion_error
= Qinconsistent_eol
;
825 case CODING_RESULT_INVALID_SRC
:
826 Vlast_code_conversion_error
= Qinvalid_source
;
828 case CODING_RESULT_INTERRUPT
:
829 Vlast_code_conversion_error
= Qinterrupted
;
831 case CODING_RESULT_INSUFFICIENT_MEM
:
832 Vlast_code_conversion_error
= Qinsufficient_memory
;
834 case CODING_RESULT_INSUFFICIENT_DST
:
835 /* Don't record this error in Vlast_code_conversion_error
836 because it happens just temporarily and is resolved when the
837 whole conversion is finished. */
839 case CODING_RESULT_SUCCESS
:
842 Vlast_code_conversion_error
= intern ("Unknown error");
846 /* These wrapper macros are used to preserve validity of pointers into
847 buffer text across calls to decode_char, encode_char, etc, which
848 could cause relocation of buffers if it loads a charset map,
849 because loading a charset map allocates large structures. */
851 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
855 charset_map_loaded = 0; \
856 c = DECODE_CHAR (charset, code); \
857 if (charset_map_loaded \
858 && (offset = coding_change_source (coding))) \
861 src_base += offset; \
866 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
870 charset_map_loaded = 0; \
871 code = ENCODE_CHAR (charset, c); \
872 if (charset_map_loaded \
873 && (offset = coding_change_destination (coding))) \
880 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
884 charset_map_loaded = 0; \
885 charset = char_charset (c, charset_list, code_return); \
886 if (charset_map_loaded \
887 && (offset = coding_change_destination (coding))) \
894 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
898 charset_map_loaded = 0; \
899 result = CHAR_CHARSET_P (c, charset); \
900 if (charset_map_loaded \
901 && (offset = coding_change_destination (coding))) \
909 /* If there are at least BYTES length of room at dst, allocate memory
910 for coding->destination and update dst and dst_end. We don't have
911 to take care of coding->source which will be relocated. It is
912 handled by calling coding_set_source in encode_coding. */
914 #define ASSURE_DESTINATION(bytes) \
916 if (dst + (bytes) >= dst_end) \
918 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
920 dst = alloc_destination (coding, more_bytes, dst); \
921 dst_end = coding->destination + coding->dst_bytes; \
926 /* Store multibyte form of the character C in P, and advance P to the
927 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
928 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
929 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
931 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
933 /* Return the character code of character whose multibyte form is at
934 P, and advance P to the end of the multibyte form. This used to be
935 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
936 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
938 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
940 /* Set coding->source from coding->src_object. */
943 coding_set_source (struct coding_system
*coding
)
945 if (BUFFERP (coding
->src_object
))
947 struct buffer
*buf
= XBUFFER (coding
->src_object
);
949 if (coding
->src_pos
< 0)
950 coding
->source
= BUF_GAP_END_ADDR (buf
) + coding
->src_pos_byte
;
952 coding
->source
= BUF_BYTE_ADDRESS (buf
, coding
->src_pos_byte
);
954 else if (STRINGP (coding
->src_object
))
956 coding
->source
= SDATA (coding
->src_object
) + coding
->src_pos_byte
;
960 /* Otherwise, the source is C string and is never relocated
961 automatically. Thus we don't have to update anything. */
966 /* Set coding->source from coding->src_object, and return how many
967 bytes coding->source was changed. */
970 coding_change_source (struct coding_system
*coding
)
972 const unsigned char *orig
= coding
->source
;
973 coding_set_source (coding
);
974 return coding
->source
- orig
;
978 /* Set coding->destination from coding->dst_object. */
981 coding_set_destination (struct coding_system
*coding
)
983 if (BUFFERP (coding
->dst_object
))
985 if (BUFFERP (coding
->src_object
) && coding
->src_pos
< 0)
987 coding
->destination
= BEG_ADDR
+ coding
->dst_pos_byte
- BEG_BYTE
;
988 coding
->dst_bytes
= (GAP_END_ADDR
989 - (coding
->src_bytes
- coding
->consumed
)
990 - coding
->destination
);
994 /* We are sure that coding->dst_pos_byte is before the gap
996 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
997 + coding
->dst_pos_byte
- BEG_BYTE
);
998 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
999 - coding
->destination
);
1004 /* Otherwise, the destination is C string and is never relocated
1005 automatically. Thus we don't have to update anything. */
1010 /* Set coding->destination from coding->dst_object, and return how
1011 many bytes coding->destination was changed. */
1014 coding_change_destination (struct coding_system
*coding
)
1016 const unsigned char *orig
= coding
->destination
;
1017 coding_set_destination (coding
);
1018 return coding
->destination
- orig
;
1023 coding_alloc_by_realloc (struct coding_system
*coding
, ptrdiff_t bytes
)
1025 if (STRING_BYTES_BOUND
- coding
->dst_bytes
< bytes
)
1027 coding
->destination
= xrealloc (coding
->destination
,
1028 coding
->dst_bytes
+ bytes
);
1029 coding
->dst_bytes
+= bytes
;
1033 coding_alloc_by_making_gap (struct coding_system
*coding
,
1034 ptrdiff_t gap_head_used
, ptrdiff_t bytes
)
1036 if (EQ (coding
->src_object
, coding
->dst_object
))
1038 /* The gap may contain the produced data at the head and not-yet
1039 consumed data at the tail. To preserve those data, we at
1040 first make the gap size to zero, then increase the gap
1042 ptrdiff_t add
= GAP_SIZE
;
1044 GPT
+= gap_head_used
, GPT_BYTE
+= gap_head_used
;
1045 GAP_SIZE
= 0; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
1047 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
1048 GPT
-= gap_head_used
, GPT_BYTE
-= gap_head_used
;
1052 Lisp_Object this_buffer
;
1054 this_buffer
= Fcurrent_buffer ();
1055 set_buffer_internal (XBUFFER (coding
->dst_object
));
1057 set_buffer_internal (XBUFFER (this_buffer
));
1062 static unsigned char *
1063 alloc_destination (struct coding_system
*coding
, ptrdiff_t nbytes
,
1066 ptrdiff_t offset
= dst
- coding
->destination
;
1068 if (BUFFERP (coding
->dst_object
))
1070 struct buffer
*buf
= XBUFFER (coding
->dst_object
);
1072 coding_alloc_by_making_gap (coding
, dst
- BUF_GPT_ADDR (buf
), nbytes
);
1075 coding_alloc_by_realloc (coding
, nbytes
);
1076 coding_set_destination (coding
);
1077 dst
= coding
->destination
+ offset
;
1081 /** Macros for annotations. */
1083 /* An annotation data is stored in the array coding->charbuf in this
1085 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1086 LENGTH is the number of elements in the annotation.
1087 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1088 NCHARS is the number of characters in the text annotated.
1090 The format of the following elements depend on ANNOTATION_MASK.
1092 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1094 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1096 NBYTES is the number of bytes specified in the header part of
1097 old-style emacs-mule encoding, or 0 for the other kind of
1100 METHOD is one of enum composition_method.
1102 Optional COMPOSITION-COMPONENTS are characters and composition
1105 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1108 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1109 recover from an invalid annotation, and should be skipped by
1110 produce_annotation. */
1112 /* Maximum length of the header of annotation data. */
1113 #define MAX_ANNOTATION_LENGTH 5
1115 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1117 *(buf)++ = -(len); \
1118 *(buf)++ = (mask); \
1119 *(buf)++ = (nchars); \
1120 coding->annotated = 1; \
1123 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1125 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1131 #define ADD_CHARSET_DATA(buf, nchars, id) \
1133 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1138 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1145 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1146 Return true if a text is encoded in UTF-8. */
1148 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1149 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1150 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1151 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1152 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1153 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1155 #define UTF_8_BOM_1 0xEF
1156 #define UTF_8_BOM_2 0xBB
1157 #define UTF_8_BOM_3 0xBF
1160 detect_coding_utf_8 (struct coding_system
*coding
,
1161 struct coding_detection_info
*detect_info
)
1163 const unsigned char *src
= coding
->source
, *src_base
;
1164 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1165 bool multibytep
= coding
->src_multibyte
;
1166 ptrdiff_t consumed_chars
= 0;
1170 detect_info
->checked
|= CATEGORY_MASK_UTF_8
;
1171 /* A coding system of this category is always ASCII compatible. */
1172 src
+= coding
->head_ascii
;
1176 int c
, c1
, c2
, c3
, c4
;
1180 if (c
< 0 || UTF_8_1_OCTET_P (c
))
1183 if (c1
< 0 || ! UTF_8_EXTRA_OCTET_P (c1
))
1185 if (UTF_8_2_OCTET_LEADING_P (c
))
1191 if (c2
< 0 || ! UTF_8_EXTRA_OCTET_P (c2
))
1193 if (UTF_8_3_OCTET_LEADING_P (c
))
1196 if (src_base
== coding
->source
1197 && c
== UTF_8_BOM_1
&& c1
== UTF_8_BOM_2
&& c2
== UTF_8_BOM_3
)
1202 if (c3
< 0 || ! UTF_8_EXTRA_OCTET_P (c3
))
1204 if (UTF_8_4_OCTET_LEADING_P (c
))
1210 if (c4
< 0 || ! UTF_8_EXTRA_OCTET_P (c4
))
1212 if (UTF_8_5_OCTET_LEADING_P (c
))
1219 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1223 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1225 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1230 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1231 detect_info
->found
|= CATEGORY_MASK_UTF_8_SIG
| CATEGORY_MASK_UTF_8_NOSIG
;
1235 detect_info
->rejected
|= CATEGORY_MASK_UTF_8_SIG
;
1237 detect_info
->found
|= CATEGORY_MASK_UTF_8_NOSIG
;
1244 decode_coding_utf_8 (struct coding_system
*coding
)
1246 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1247 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1248 const unsigned char *src_base
;
1249 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
1250 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
;
1251 ptrdiff_t consumed_chars
= 0, consumed_chars_base
= 0;
1252 bool multibytep
= coding
->src_multibyte
;
1253 enum utf_bom_type bom
= CODING_UTF_8_BOM (coding
);
1255 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
1256 int byte_after_cr
= -1;
1258 if (bom
!= utf_without_bom
)
1264 if (! UTF_8_3_OCTET_LEADING_P (c1
))
1269 if (! UTF_8_EXTRA_OCTET_P (c2
))
1274 if (! UTF_8_EXTRA_OCTET_P (c3
))
1278 if ((c1
!= UTF_8_BOM_1
)
1279 || (c2
!= UTF_8_BOM_2
) || (c3
!= UTF_8_BOM_3
))
1282 CODING_UTF_8_BOM (coding
) = utf_without_bom
;
1287 CODING_UTF_8_BOM (coding
) = utf_without_bom
;
1291 int c
, c1
, c2
, c3
, c4
, c5
;
1294 consumed_chars_base
= consumed_chars
;
1296 if (charbuf
>= charbuf_end
)
1298 if (byte_after_cr
>= 0)
1303 if (byte_after_cr
>= 0)
1304 c1
= byte_after_cr
, byte_after_cr
= -1;
1311 else if (UTF_8_1_OCTET_P (c1
))
1313 if (eol_dos
&& c1
== '\r')
1314 ONE_MORE_BYTE (byte_after_cr
);
1320 if (c2
< 0 || ! UTF_8_EXTRA_OCTET_P (c2
))
1322 if (UTF_8_2_OCTET_LEADING_P (c1
))
1324 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1325 /* Reject overlong sequences here and below. Encoders
1326 producing them are incorrect, they can be misleading,
1327 and they mess up read/write invariance. */
1334 if (c3
< 0 || ! UTF_8_EXTRA_OCTET_P (c3
))
1336 if (UTF_8_3_OCTET_LEADING_P (c1
))
1338 c
= (((c1
& 0xF) << 12)
1339 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1341 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
1347 if (c4
< 0 || ! UTF_8_EXTRA_OCTET_P (c4
))
1349 if (UTF_8_4_OCTET_LEADING_P (c1
))
1351 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1352 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1359 if (c5
< 0 || ! UTF_8_EXTRA_OCTET_P (c5
))
1361 if (UTF_8_5_OCTET_LEADING_P (c1
))
1363 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1364 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1366 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1381 consumed_chars
= consumed_chars_base
;
1383 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1388 coding
->consumed_char
+= consumed_chars_base
;
1389 coding
->consumed
= src_base
- coding
->source
;
1390 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1395 encode_coding_utf_8 (struct coding_system
*coding
)
1397 bool multibytep
= coding
->dst_multibyte
;
1398 int *charbuf
= coding
->charbuf
;
1399 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1400 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1401 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1402 ptrdiff_t produced_chars
= 0;
1405 if (CODING_UTF_8_BOM (coding
) == utf_with_bom
)
1407 ASSURE_DESTINATION (3);
1408 EMIT_THREE_BYTES (UTF_8_BOM_1
, UTF_8_BOM_2
, UTF_8_BOM_3
);
1409 CODING_UTF_8_BOM (coding
) = utf_without_bom
;
1414 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1416 while (charbuf
< charbuf_end
)
1418 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1420 ASSURE_DESTINATION (safe_room
);
1422 if (CHAR_BYTE8_P (c
))
1424 c
= CHAR_TO_BYTE8 (c
);
1429 CHAR_STRING_ADVANCE_NO_UNIFY (c
, pend
);
1430 for (p
= str
; p
< pend
; p
++)
1437 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1439 while (charbuf
< charbuf_end
)
1441 ASSURE_DESTINATION (safe_room
);
1443 if (CHAR_BYTE8_P (c
))
1444 *dst
++ = CHAR_TO_BYTE8 (c
);
1446 CHAR_STRING_ADVANCE_NO_UNIFY (c
, dst
);
1450 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1451 coding
->produced_char
+= produced_chars
;
1452 coding
->produced
= dst
- coding
->destination
;
1457 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1458 Return true if a text is encoded in one of UTF-16 based coding systems. */
1460 #define UTF_16_HIGH_SURROGATE_P(val) \
1461 (((val) & 0xFC00) == 0xD800)
1463 #define UTF_16_LOW_SURROGATE_P(val) \
1464 (((val) & 0xFC00) == 0xDC00)
1468 detect_coding_utf_16 (struct coding_system
*coding
,
1469 struct coding_detection_info
*detect_info
)
1471 const unsigned char *src
= coding
->source
;
1472 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1473 bool multibytep
= coding
->src_multibyte
;
1476 detect_info
->checked
|= CATEGORY_MASK_UTF_16
;
1477 if (coding
->mode
& CODING_MODE_LAST_BLOCK
1478 && (coding
->src_chars
& 1))
1480 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1484 TWO_MORE_BYTES (c1
, c2
);
1485 if ((c1
== 0xFF) && (c2
== 0xFE))
1487 detect_info
->found
|= (CATEGORY_MASK_UTF_16_LE
1488 | CATEGORY_MASK_UTF_16_AUTO
);
1489 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_BE
1490 | CATEGORY_MASK_UTF_16_BE_NOSIG
1491 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1493 else if ((c1
== 0xFE) && (c2
== 0xFF))
1495 detect_info
->found
|= (CATEGORY_MASK_UTF_16_BE
1496 | CATEGORY_MASK_UTF_16_AUTO
);
1497 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_LE
1498 | CATEGORY_MASK_UTF_16_BE_NOSIG
1499 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1503 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1508 /* We check the dispersion of Eth and Oth bytes where E is even and
1509 O is odd. If both are high, we assume binary data.*/
1510 unsigned char e
[256], o
[256];
1511 unsigned e_num
= 1, o_num
= 1;
1518 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_AUTO
1519 |CATEGORY_MASK_UTF_16_BE
1520 | CATEGORY_MASK_UTF_16_LE
);
1522 while ((detect_info
->rejected
& CATEGORY_MASK_UTF_16
)
1523 != CATEGORY_MASK_UTF_16
)
1525 TWO_MORE_BYTES (c1
, c2
);
1533 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_BE_NOSIG
;
1540 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_LE_NOSIG
;
1551 decode_coding_utf_16 (struct coding_system
*coding
)
1553 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1554 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1555 const unsigned char *src_base
;
1556 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
1557 /* We may produces at most 3 chars in one loop. */
1558 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
- 2;
1559 ptrdiff_t consumed_chars
= 0, consumed_chars_base
= 0;
1560 bool multibytep
= coding
->src_multibyte
;
1561 enum utf_bom_type bom
= CODING_UTF_16_BOM (coding
);
1562 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1563 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1565 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
1566 int byte_after_cr1
= -1, byte_after_cr2
= -1;
1568 if (bom
== utf_with_bom
)
1577 if (endian
== utf_16_big_endian
1578 ? c
!= 0xFEFF : c
!= 0xFFFE)
1580 /* The first two bytes are not BOM. Treat them as bytes
1581 for a normal character. */
1585 CODING_UTF_16_BOM (coding
) = utf_without_bom
;
1587 else if (bom
== utf_detect_bom
)
1589 /* We have already tried to detect BOM and failed in
1591 CODING_UTF_16_BOM (coding
) = utf_without_bom
;
1599 consumed_chars_base
= consumed_chars
;
1601 if (charbuf
>= charbuf_end
)
1603 if (byte_after_cr1
>= 0)
1608 if (byte_after_cr1
>= 0)
1609 c1
= byte_after_cr1
, byte_after_cr1
= -1;
1617 if (byte_after_cr2
>= 0)
1618 c2
= byte_after_cr2
, byte_after_cr2
= -1;
1623 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
1627 c
= (endian
== utf_16_big_endian
1628 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1632 if (! UTF_16_LOW_SURROGATE_P (c
))
1634 if (endian
== utf_16_big_endian
)
1635 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1637 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1641 if (UTF_16_HIGH_SURROGATE_P (c
))
1642 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1648 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1649 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1650 *charbuf
++ = 0x10000 + c
;
1655 if (UTF_16_HIGH_SURROGATE_P (c
))
1656 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1659 if (eol_dos
&& c
== '\r')
1661 ONE_MORE_BYTE (byte_after_cr1
);
1662 ONE_MORE_BYTE (byte_after_cr2
);
1670 coding
->consumed_char
+= consumed_chars_base
;
1671 coding
->consumed
= src_base
- coding
->source
;
1672 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1676 encode_coding_utf_16 (struct coding_system
*coding
)
1678 bool multibytep
= coding
->dst_multibyte
;
1679 int *charbuf
= coding
->charbuf
;
1680 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1681 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1682 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1684 enum utf_bom_type bom
= CODING_UTF_16_BOM (coding
);
1685 bool big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1686 ptrdiff_t produced_chars
= 0;
1689 if (bom
!= utf_without_bom
)
1691 ASSURE_DESTINATION (safe_room
);
1693 EMIT_TWO_BYTES (0xFE, 0xFF);
1695 EMIT_TWO_BYTES (0xFF, 0xFE);
1696 CODING_UTF_16_BOM (coding
) = utf_without_bom
;
1699 while (charbuf
< charbuf_end
)
1701 ASSURE_DESTINATION (safe_room
);
1703 if (c
> MAX_UNICODE_CHAR
)
1704 c
= coding
->default_char
;
1709 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1711 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1718 c1
= (c
>> 10) + 0xD800;
1719 c2
= (c
& 0x3FF) + 0xDC00;
1721 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1723 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1726 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1727 coding
->produced
= dst
- coding
->destination
;
1728 coding
->produced_char
+= produced_chars
;
1733 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1735 /* Emacs' internal format for representation of multiple character
1736 sets is a kind of multi-byte encoding, i.e. characters are
1737 represented by variable-length sequences of one-byte codes.
1739 ASCII characters and control characters (e.g. `tab', `newline') are
1740 represented by one-byte sequences which are their ASCII codes, in
1741 the range 0x00 through 0x7F.
1743 8-bit characters of the range 0x80..0x9F are represented by
1744 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1747 8-bit characters of the range 0xA0..0xFF are represented by
1748 one-byte sequences which are their 8-bit code.
1750 The other characters are represented by a sequence of `base
1751 leading-code', optional `extended leading-code', and one or two
1752 `position-code's. The length of the sequence is determined by the
1753 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1754 whereas extended leading-code and position-code take the range 0xA0
1755 through 0xFF. See `charset.h' for more details about leading-code
1758 --- CODE RANGE of Emacs' internal format ---
1762 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1763 eight-bit-graphic 0xA0..0xBF
1764 ELSE 0x81..0x9D + [0xA0..0xFF]+
1765 ---------------------------------------------
1767 As this is the internal character representation, the format is
1768 usually not used externally (i.e. in a file or in a data sent to a
1769 process). But, it is possible to have a text externally in this
1770 format (i.e. by encoding by the coding system `emacs-mule').
1772 In that case, a sequence of one-byte codes has a slightly different
1775 At first, all characters in eight-bit-control are represented by
1776 one-byte sequences which are their 8-bit code.
1778 Next, character composition data are represented by the byte
1779 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1781 METHOD is 0xF2 plus one of composition method (enum
1782 composition_method),
1784 BYTES is 0xA0 plus a byte length of this composition data,
1786 CHARS is 0xA0 plus a number of characters composed by this
1789 COMPONENTs are characters of multibyte form or composition
1790 rules encoded by two-byte of ASCII codes.
1792 In addition, for backward compatibility, the following formats are
1793 also recognized as composition data on decoding.
1796 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1799 MSEQ is a multibyte form but in these special format:
1800 ASCII: 0xA0 ASCII_CODE+0x80,
1801 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1802 RULE is a one byte code of the range 0xA0..0xF0 that
1803 represents a composition rule.
1806 char emacs_mule_bytes
[256];
1809 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1810 Return true if a text is encoded in 'emacs-mule'. */
1813 detect_coding_emacs_mule (struct coding_system
*coding
,
1814 struct coding_detection_info
*detect_info
)
1816 const unsigned char *src
= coding
->source
, *src_base
;
1817 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1818 bool multibytep
= coding
->src_multibyte
;
1819 ptrdiff_t consumed_chars
= 0;
1823 detect_info
->checked
|= CATEGORY_MASK_EMACS_MULE
;
1824 /* A coding system of this category is always ASCII compatible. */
1825 src
+= coding
->head_ascii
;
1835 /* Perhaps the start of composite character. We simply skip
1836 it because analyzing it is too heavy for detecting. But,
1837 at least, we check that the composite character
1838 constitutes of more than 4 bytes. */
1839 const unsigned char *src_start
;
1849 if (src
- src_start
<= 4)
1851 found
= CATEGORY_MASK_EMACS_MULE
;
1859 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1864 int more_bytes
= emacs_mule_bytes
[c
] - 1;
1866 while (more_bytes
> 0)
1871 src
--; /* Unread the last byte. */
1876 if (more_bytes
!= 0)
1878 found
= CATEGORY_MASK_EMACS_MULE
;
1881 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1885 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1887 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1890 detect_info
->found
|= found
;
1895 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1896 character. If CMP_STATUS indicates that we must expect MSEQ or
1897 RULE described above, decode it and return the negative value of
1898 the decoded character or rule. If an invalid byte is found, return
1899 -1. If SRC is too short, return -2. */
1902 emacs_mule_char (struct coding_system
*coding
, const unsigned char *src
,
1903 int *nbytes
, int *nchars
, int *id
,
1904 struct composition_status
*cmp_status
)
1906 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1907 const unsigned char *src_base
= src
;
1908 bool multibytep
= coding
->src_multibyte
;
1912 int consumed_chars
= 0;
1913 bool mseq_found
= 0;
1919 charset_ID
= emacs_mule_charset
[0];
1925 if (cmp_status
->state
!= COMPOSING_NO
1926 && cmp_status
->old_form
)
1928 if (cmp_status
->state
== COMPOSING_CHAR
)
1943 *nbytes
= src
- src_base
;
1944 *nchars
= consumed_chars
;
1952 switch (emacs_mule_bytes
[c
])
1955 if ((charset_ID
= emacs_mule_charset
[c
]) < 0)
1964 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
1965 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
1968 if (c
< 0xA0 || (charset_ID
= emacs_mule_charset
[c
]) < 0)
1977 if ((charset_ID
= emacs_mule_charset
[c
]) < 0)
1982 code
= (c
& 0x7F) << 8;
1992 if (c
< 0 || (charset_ID
= emacs_mule_charset
[c
]) < 0)
1997 code
= (c
& 0x7F) << 8;
2006 charset_ID
= ASCII_BYTE_P (code
) ? charset_ascii
: charset_eight_bit
;
2012 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
2013 CHARSET_FROM_ID (charset_ID
), code
, c
);
2017 *nbytes
= src
- src_base
;
2018 *nchars
= consumed_chars
;
2021 return (mseq_found
? -c
: c
);
2031 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2033 /* Handle these composition sequence ('|': the end of header elements,
2034 BYTES and CHARS >= 0xA0):
2036 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2037 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2038 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2042 (4) relative composition: 0x80 | MSEQ ... MSEQ
2043 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2045 When the starter 0x80 and the following header elements are found,
2046 this annotation header is produced.
2048 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2050 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2051 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2053 Then, upon reading the following elements, these codes are produced
2054 until the composition end is found:
2057 (2) ALT ... ALT CHAR ... CHAR
2058 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2060 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2062 When the composition end is found, LENGTH and NCHARS in the
2063 annotation header is updated as below:
2065 (1) LENGTH: unchanged, NCHARS: unchanged
2066 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2067 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2068 (4) LENGTH: unchanged, NCHARS: number of CHARs
2069 (5) LENGTH: unchanged, NCHARS: number of CHARs
2071 If an error is found while composing, the annotation header is
2072 changed to the original composition header (plus filler -1s) as
2075 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2076 (5) [ 0x80 0xFF -1 -1- -1 ]
2078 and the sequence [ -2 DECODED-RULE ] is changed to the original
2079 byte sequence as below:
2080 o the original byte sequence is B: [ B -1 ]
2081 o the original byte sequence is B1 B2: [ B1 B2 ]
2083 Most of the routines are implemented by macros because many
2084 variables and labels in the caller decode_coding_emacs_mule must be
2085 accessible, and they are usually called just once (thus doesn't
2086 increase the size of compiled object). */
2088 /* Decode a composition rule represented by C as a component of
2089 composition sequence of Emacs 20 style. Set RULE to the decoded
2092 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2097 if (c < 0 || c >= 81) \
2098 goto invalid_code; \
2099 gref = c / 9, nref = c % 9; \
2100 if (gref == 4) gref = 10; \
2101 if (nref == 4) nref = 10; \
2102 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2106 /* Decode a composition rule represented by C and the following byte
2107 at SRC as a component of composition sequence of Emacs 21 style.
2108 Set RULE to the decoded rule. */
2110 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2115 if (gref < 0 || gref >= 81) \
2116 goto invalid_code; \
2117 ONE_MORE_BYTE (c); \
2119 if (nref < 0 || nref >= 81) \
2120 goto invalid_code; \
2121 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2125 /* Start of Emacs 21 style format. The first three bytes at SRC are
2126 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2127 byte length of this composition information, CHARS is the number of
2128 characters composed by this composition. */
2130 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2132 enum composition_method method = c - 0xF2; \
2133 int nbytes, nchars; \
2135 ONE_MORE_BYTE (c); \
2137 goto invalid_code; \
2138 nbytes = c - 0xA0; \
2139 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2140 goto invalid_code; \
2141 ONE_MORE_BYTE (c); \
2142 nchars = c - 0xA0; \
2143 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2144 goto invalid_code; \
2145 cmp_status->old_form = 0; \
2146 cmp_status->method = method; \
2147 if (method == COMPOSITION_RELATIVE) \
2148 cmp_status->state = COMPOSING_CHAR; \
2150 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2151 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2152 cmp_status->nchars = nchars; \
2153 cmp_status->ncomps = nbytes - 4; \
2154 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2158 /* Start of Emacs 20 style format for relative composition. */
2160 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2162 cmp_status->old_form = 1; \
2163 cmp_status->method = COMPOSITION_RELATIVE; \
2164 cmp_status->state = COMPOSING_CHAR; \
2165 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2166 cmp_status->nchars = cmp_status->ncomps = 0; \
2167 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2171 /* Start of Emacs 20 style format for rule-base composition. */
2173 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2175 cmp_status->old_form = 1; \
2176 cmp_status->method = COMPOSITION_WITH_RULE; \
2177 cmp_status->state = COMPOSING_CHAR; \
2178 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2179 cmp_status->nchars = cmp_status->ncomps = 0; \
2180 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2184 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2186 const unsigned char *current_src = src; \
2188 ONE_MORE_BYTE (c); \
2190 goto invalid_code; \
2191 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2192 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2193 DECODE_EMACS_MULE_21_COMPOSITION (); \
2194 else if (c < 0xA0) \
2195 goto invalid_code; \
2196 else if (c < 0xC0) \
2198 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2199 /* Re-read C as a composition component. */ \
2200 src = current_src; \
2202 else if (c == 0xFF) \
2203 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2205 goto invalid_code; \
2208 #define EMACS_MULE_COMPOSITION_END() \
2210 int idx = - cmp_status->length; \
2212 if (cmp_status->old_form) \
2213 charbuf[idx + 2] = cmp_status->nchars; \
2214 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2215 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2216 cmp_status->state = COMPOSING_NO; \
2221 emacs_mule_finish_composition (int *charbuf
,
2222 struct composition_status
*cmp_status
)
2224 int idx
= - cmp_status
->length
;
2227 if (cmp_status
->old_form
&& cmp_status
->nchars
> 0)
2229 charbuf
[idx
+ 2] = cmp_status
->nchars
;
2231 if (cmp_status
->method
== COMPOSITION_WITH_RULE
2232 && cmp_status
->state
== COMPOSING_CHAR
)
2234 /* The last rule was invalid. */
2235 int rule
= charbuf
[-1] + 0xA0;
2237 charbuf
[-2] = BYTE8_TO_CHAR (rule
);
2244 charbuf
[idx
++] = BYTE8_TO_CHAR (0x80);
2246 if (cmp_status
->method
== COMPOSITION_WITH_RULE
)
2248 charbuf
[idx
++] = BYTE8_TO_CHAR (0xFF);
2249 charbuf
[idx
++] = -3;
2255 int nchars
= charbuf
[idx
+ 1] + 0xA0;
2256 int nbytes
= charbuf
[idx
+ 2] + 0xA0;
2258 charbuf
[idx
++] = BYTE8_TO_CHAR (0xF2 + cmp_status
->method
);
2259 charbuf
[idx
++] = BYTE8_TO_CHAR (nbytes
);
2260 charbuf
[idx
++] = BYTE8_TO_CHAR (nchars
);
2261 charbuf
[idx
++] = -1;
2265 cmp_status
->state
= COMPOSING_NO
;
2269 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2271 if (cmp_status->state != COMPOSING_NO) \
2272 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2277 decode_coding_emacs_mule (struct coding_system
*coding
)
2279 const unsigned char *src
= coding
->source
+ coding
->consumed
;
2280 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2281 const unsigned char *src_base
;
2282 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
2283 /* We may produce two annotations (charset and composition) in one
2284 loop and one more charset annotation at the end. */
2286 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 3)
2287 /* We can produce up to 2 characters in a loop. */
2289 ptrdiff_t consumed_chars
= 0, consumed_chars_base
;
2290 bool multibytep
= coding
->src_multibyte
;
2291 ptrdiff_t char_offset
= coding
->produced_char
;
2292 ptrdiff_t last_offset
= char_offset
;
2293 int last_id
= charset_ascii
;
2295 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
2296 int byte_after_cr
= -1;
2297 struct composition_status
*cmp_status
= &coding
->spec
.emacs_mule
.cmp_status
;
2299 if (cmp_status
->state
!= COMPOSING_NO
)
2303 if (charbuf_end
- charbuf
< cmp_status
->length
)
2305 for (i
= 0; i
< cmp_status
->length
; i
++)
2306 *charbuf
++ = cmp_status
->carryover
[i
];
2307 coding
->annotated
= 1;
2312 int c
, id
IF_LINT (= 0);
2315 consumed_chars_base
= consumed_chars
;
2317 if (charbuf
>= charbuf_end
)
2319 if (byte_after_cr
>= 0)
2324 if (byte_after_cr
>= 0)
2325 c
= byte_after_cr
, byte_after_cr
= -1;
2329 if (c
< 0 || c
== 0x80)
2331 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2338 DECODE_EMACS_MULE_COMPOSITION_START ();
2344 if (eol_dos
&& c
== '\r')
2345 ONE_MORE_BYTE (byte_after_cr
);
2347 if (cmp_status
->state
!= COMPOSING_NO
)
2349 if (cmp_status
->old_form
)
2350 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2351 else if (cmp_status
->state
>= COMPOSING_COMPONENT_CHAR
)
2352 cmp_status
->ncomps
--;
2357 int nchars
IF_LINT (= 0), nbytes
IF_LINT (= 0);
2358 /* emacs_mule_char can load a charset map from a file, which
2359 allocates a large structure and might cause buffer text
2360 to be relocated as result. Thus, we need to remember the
2361 original pointer to buffer text, and fix up all related
2362 pointers after the call. */
2363 const unsigned char *orig
= coding
->source
;
2366 c
= emacs_mule_char (coding
, src_base
, &nbytes
, &nchars
, &id
,
2368 offset
= coding
->source
- orig
;
2382 src
= src_base
+ nbytes
;
2383 consumed_chars
= consumed_chars_base
+ nchars
;
2384 if (cmp_status
->state
>= COMPOSING_COMPONENT_CHAR
)
2385 cmp_status
->ncomps
-= nchars
;
2388 /* Now if C >= 0, we found a normally encoded character, if C <
2389 0, we found an old-style composition component character or
2392 if (cmp_status
->state
== COMPOSING_NO
)
2396 if (last_id
!= charset_ascii
)
2397 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
,
2400 last_offset
= char_offset
;
2405 else if (cmp_status
->state
== COMPOSING_CHAR
)
2407 if (cmp_status
->old_form
)
2411 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2418 cmp_status
->nchars
++;
2419 cmp_status
->length
++;
2420 if (cmp_status
->nchars
== MAX_COMPOSITION_COMPONENTS
)
2421 EMACS_MULE_COMPOSITION_END ();
2422 else if (cmp_status
->method
== COMPOSITION_WITH_RULE
)
2423 cmp_status
->state
= COMPOSING_RULE
;
2429 cmp_status
->length
++;
2430 cmp_status
->nchars
--;
2431 if (cmp_status
->nchars
== 0)
2432 EMACS_MULE_COMPOSITION_END ();
2435 else if (cmp_status
->state
== COMPOSING_RULE
)
2441 EMACS_MULE_COMPOSITION_END ();
2448 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c
, rule
);
2453 cmp_status
->length
+= 2;
2454 cmp_status
->state
= COMPOSING_CHAR
;
2457 else if (cmp_status
->state
== COMPOSING_COMPONENT_CHAR
)
2460 cmp_status
->length
++;
2461 if (cmp_status
->ncomps
== 0)
2462 cmp_status
->state
= COMPOSING_CHAR
;
2463 else if (cmp_status
->ncomps
> 0)
2465 if (cmp_status
->method
== COMPOSITION_WITH_RULE_ALTCHARS
)
2466 cmp_status
->state
= COMPOSING_COMPONENT_RULE
;
2469 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2471 else /* COMPOSING_COMPONENT_RULE */
2475 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c
, rule
);
2480 cmp_status
->length
+= 2;
2481 cmp_status
->ncomps
--;
2482 if (cmp_status
->ncomps
> 0)
2483 cmp_status
->state
= COMPOSING_COMPONENT_CHAR
;
2485 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2490 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2492 consumed_chars
= consumed_chars_base
;
2494 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
2500 if (cmp_status
->state
!= COMPOSING_NO
)
2502 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
2503 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2508 charbuf
-= cmp_status
->length
;
2509 for (i
= 0; i
< cmp_status
->length
; i
++)
2510 cmp_status
->carryover
[i
] = charbuf
[i
];
2513 if (last_id
!= charset_ascii
)
2514 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
2515 coding
->consumed_char
+= consumed_chars_base
;
2516 coding
->consumed
= src_base
- coding
->source
;
2517 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
2521 #define EMACS_MULE_LEADING_CODES(id, codes) \
2524 codes[0] = id, codes[1] = 0; \
2525 else if (id < 0xE0) \
2526 codes[0] = 0x9A, codes[1] = id; \
2527 else if (id < 0xF0) \
2528 codes[0] = 0x9B, codes[1] = id; \
2529 else if (id < 0xF5) \
2530 codes[0] = 0x9C, codes[1] = id; \
2532 codes[0] = 0x9D, codes[1] = id; \
2537 encode_coding_emacs_mule (struct coding_system
*coding
)
2539 bool multibytep
= coding
->dst_multibyte
;
2540 int *charbuf
= coding
->charbuf
;
2541 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
2542 unsigned char *dst
= coding
->destination
+ coding
->produced
;
2543 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
2545 ptrdiff_t produced_chars
= 0;
2546 Lisp_Object attrs
, charset_list
;
2548 int preferred_charset_id
= -1;
2550 CODING_GET_INFO (coding
, attrs
, charset_list
);
2551 if (! EQ (charset_list
, Vemacs_mule_charset_list
))
2553 charset_list
= Vemacs_mule_charset_list
;
2554 ASET (attrs
, coding_attr_charset_list
, charset_list
);
2557 while (charbuf
< charbuf_end
)
2559 ASSURE_DESTINATION (safe_room
);
2564 /* Handle an annotation. */
2567 case CODING_ANNOTATE_COMPOSITION_MASK
:
2568 /* Not yet implemented. */
2570 case CODING_ANNOTATE_CHARSET_MASK
:
2571 preferred_charset_id
= charbuf
[3];
2572 if (preferred_charset_id
>= 0
2573 && NILP (Fmemq (make_number (preferred_charset_id
),
2575 preferred_charset_id
= -1;
2584 if (ASCII_CHAR_P (c
))
2585 EMIT_ONE_ASCII_BYTE (c
);
2586 else if (CHAR_BYTE8_P (c
))
2588 c
= CHAR_TO_BYTE8 (c
);
2593 struct charset
*charset
;
2597 unsigned char leading_codes
[2];
2599 if (preferred_charset_id
>= 0)
2603 charset
= CHARSET_FROM_ID (preferred_charset_id
);
2604 CODING_CHAR_CHARSET_P (coding
, dst
, dst_end
, c
, charset
, result
);
2606 code
= ENCODE_CHAR (charset
, c
);
2608 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
2612 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
2616 c
= coding
->default_char
;
2617 if (ASCII_CHAR_P (c
))
2619 EMIT_ONE_ASCII_BYTE (c
);
2622 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
2625 dimension
= CHARSET_DIMENSION (charset
);
2626 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2627 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2628 EMIT_ONE_BYTE (leading_codes
[0]);
2629 if (leading_codes
[1])
2630 EMIT_ONE_BYTE (leading_codes
[1]);
2632 EMIT_ONE_BYTE (code
| 0x80);
2636 EMIT_ONE_BYTE (code
>> 8);
2637 EMIT_ONE_BYTE (code
& 0xFF);
2641 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
2642 coding
->produced_char
+= produced_chars
;
2643 coding
->produced
= dst
- coding
->destination
;
2648 /*** 7. ISO2022 handlers ***/
2650 /* The following note describes the coding system ISO2022 briefly.
2651 Since the intention of this note is to help understand the
2652 functions in this file, some parts are NOT ACCURATE or are OVERLY
2653 SIMPLIFIED. For thorough understanding, please refer to the
2654 original document of ISO2022. This is equivalent to the standard
2655 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2657 ISO2022 provides many mechanisms to encode several character sets
2658 in 7-bit and 8-bit environments. For 7-bit environments, all text
2659 is encoded using bytes less than 128. This may make the encoded
2660 text a little bit longer, but the text passes more easily through
2661 several types of gateway, some of which strip off the MSB (Most
2664 There are two kinds of character sets: control character sets and
2665 graphic character sets. The former contain control characters such
2666 as `newline' and `escape' to provide control functions (control
2667 functions are also provided by escape sequences). The latter
2668 contain graphic characters such as 'A' and '-'. Emacs recognizes
2669 two control character sets and many graphic character sets.
2671 Graphic character sets are classified into one of the following
2672 four classes, according to the number of bytes (DIMENSION) and
2673 number of characters in one dimension (CHARS) of the set:
2674 - DIMENSION1_CHARS94
2675 - DIMENSION1_CHARS96
2676 - DIMENSION2_CHARS94
2677 - DIMENSION2_CHARS96
2679 In addition, each character set is assigned an identification tag,
2680 unique for each set, called the "final character" (denoted as <F>
2681 hereafter). The <F> of each character set is decided by ECMA(*)
2682 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2683 (0x30..0x3F are for private use only).
2685 Note (*): ECMA = European Computer Manufacturers Association
2687 Here are examples of graphic character sets [NAME(<F>)]:
2688 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2689 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2690 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2691 o DIMENSION2_CHARS96 -- none for the moment
2693 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2694 C0 [0x00..0x1F] -- control character plane 0
2695 GL [0x20..0x7F] -- graphic character plane 0
2696 C1 [0x80..0x9F] -- control character plane 1
2697 GR [0xA0..0xFF] -- graphic character plane 1
2699 A control character set is directly designated and invoked to C0 or
2700 C1 by an escape sequence. The most common case is that:
2701 - ISO646's control character set is designated/invoked to C0, and
2702 - ISO6429's control character set is designated/invoked to C1,
2703 and usually these designations/invocations are omitted in encoded
2704 text. In a 7-bit environment, only C0 can be used, and a control
2705 character for C1 is encoded by an appropriate escape sequence to
2706 fit into the environment. All control characters for C1 are
2707 defined to have corresponding escape sequences.
2709 A graphic character set is at first designated to one of four
2710 graphic registers (G0 through G3), then these graphic registers are
2711 invoked to GL or GR. These designations and invocations can be
2712 done independently. The most common case is that G0 is invoked to
2713 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2714 these invocations and designations are omitted in encoded text.
2715 In a 7-bit environment, only GL can be used.
2717 When a graphic character set of CHARS94 is invoked to GL, codes
2718 0x20 and 0x7F of the GL area work as control characters SPACE and
2719 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2722 There are two ways of invocation: locking-shift and single-shift.
2723 With locking-shift, the invocation lasts until the next different
2724 invocation, whereas with single-shift, the invocation affects the
2725 following character only and doesn't affect the locking-shift
2726 state. Invocations are done by the following control characters or
2729 ----------------------------------------------------------------------
2730 abbrev function cntrl escape seq description
2731 ----------------------------------------------------------------------
2732 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2733 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2734 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2735 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2736 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2737 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2738 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2739 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2740 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2741 ----------------------------------------------------------------------
2742 (*) These are not used by any known coding system.
2744 Control characters for these functions are defined by macros
2745 ISO_CODE_XXX in `coding.h'.
2747 Designations are done by the following escape sequences:
2748 ----------------------------------------------------------------------
2749 escape sequence description
2750 ----------------------------------------------------------------------
2751 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2752 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2753 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2754 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2755 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2756 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2757 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2758 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2759 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2760 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2761 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2762 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2763 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2764 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2765 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2766 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2767 ----------------------------------------------------------------------
2769 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2770 of dimension 1, chars 94, and final character <F>, etc...
2772 Note (*): Although these designations are not allowed in ISO2022,
2773 Emacs accepts them on decoding, and produces them on encoding
2774 CHARS96 character sets in a coding system which is characterized as
2775 7-bit environment, non-locking-shift, and non-single-shift.
2777 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2778 '(' must be omitted. We refer to this as "short-form" hereafter.
2780 Now you may notice that there are a lot of ways of encoding the
2781 same multilingual text in ISO2022. Actually, there exist many
2782 coding systems such as Compound Text (used in X11's inter client
2783 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2784 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2785 localized platforms), and all of these are variants of ISO2022.
2787 In addition to the above, Emacs handles two more kinds of escape
2788 sequences: ISO6429's direction specification and Emacs' private
2789 sequence for specifying character composition.
2791 ISO6429's direction specification takes the following form:
2792 o CSI ']' -- end of the current direction
2793 o CSI '0' ']' -- end of the current direction
2794 o CSI '1' ']' -- start of left-to-right text
2795 o CSI '2' ']' -- start of right-to-left text
2796 The control character CSI (0x9B: control sequence introducer) is
2797 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2799 Character composition specification takes the following form:
2800 o ESC '0' -- start relative composition
2801 o ESC '1' -- end composition
2802 o ESC '2' -- start rule-base composition (*)
2803 o ESC '3' -- start relative composition with alternate chars (**)
2804 o ESC '4' -- start rule-base composition with alternate chars (**)
2805 Since these are not standard escape sequences of any ISO standard,
2806 the use of them with these meanings is restricted to Emacs only.
2808 (*) This form is used only in Emacs 20.7 and older versions,
2809 but newer versions can safely decode it.
2810 (**) This form is used only in Emacs 21.1 and newer versions,
2811 and older versions can't decode it.
2813 Here's a list of example usages of these composition escape
2814 sequences (categorized by `enum composition_method').
2816 COMPOSITION_RELATIVE:
2817 ESC 0 CHAR [ CHAR ] ESC 1
2818 COMPOSITION_WITH_RULE:
2819 ESC 2 CHAR [ RULE CHAR ] ESC 1
2820 COMPOSITION_WITH_ALTCHARS:
2821 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2822 COMPOSITION_WITH_RULE_ALTCHARS:
2823 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2825 static enum iso_code_class_type iso_code_class
[256];
2827 #define SAFE_CHARSET_P(coding, id) \
2828 ((id) <= (coding)->max_charset_id \
2829 && (coding)->safe_charsets[id] != 255)
2832 setup_iso_safe_charsets (Lisp_Object attrs
)
2834 Lisp_Object charset_list
, safe_charsets
;
2835 Lisp_Object request
;
2836 Lisp_Object reg_usage
;
2838 EMACS_INT reg94
, reg96
;
2839 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2842 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2843 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2844 && ! EQ (charset_list
, Viso_2022_charset_list
))
2846 charset_list
= Viso_2022_charset_list
;
2847 ASET (attrs
, coding_attr_charset_list
, charset_list
);
2848 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2851 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2855 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2857 int id
= XINT (XCAR (tail
));
2858 if (max_charset_id
< id
)
2859 max_charset_id
= id
;
2862 safe_charsets
= make_uninit_string (max_charset_id
+ 1);
2863 memset (SDATA (safe_charsets
), 255, max_charset_id
+ 1);
2864 request
= AREF (attrs
, coding_attr_iso_request
);
2865 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2866 reg94
= XINT (XCAR (reg_usage
));
2867 reg96
= XINT (XCDR (reg_usage
));
2869 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2873 struct charset
*charset
;
2876 charset
= CHARSET_FROM_ID (XINT (id
));
2877 reg
= Fcdr (Fassq (id
, request
));
2879 SSET (safe_charsets
, XINT (id
), XINT (reg
));
2880 else if (charset
->iso_chars_96
)
2883 SSET (safe_charsets
, XINT (id
), reg96
);
2888 SSET (safe_charsets
, XINT (id
), reg94
);
2891 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2895 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2896 Return true if a text is encoded in one of ISO-2022 based coding
2900 detect_coding_iso_2022 (struct coding_system
*coding
,
2901 struct coding_detection_info
*detect_info
)
2903 const unsigned char *src
= coding
->source
, *src_base
= src
;
2904 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2905 bool multibytep
= coding
->src_multibyte
;
2906 bool single_shifting
= 0;
2909 ptrdiff_t consumed_chars
= 0;
2913 int composition_count
= -1;
2915 detect_info
->checked
|= CATEGORY_MASK_ISO
;
2917 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2919 struct coding_system
*this = &(coding_categories
[i
]);
2920 Lisp_Object attrs
, val
;
2924 attrs
= CODING_ID_ATTRS (this->id
);
2925 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2926 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Viso_2022_charset_list
))
2927 setup_iso_safe_charsets (attrs
);
2928 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2929 this->max_charset_id
= SCHARS (val
) - 1;
2930 this->safe_charsets
= SDATA (val
);
2933 /* A coding system of this category is always ASCII compatible. */
2934 src
+= coding
->head_ascii
;
2936 while (rejected
!= CATEGORY_MASK_ISO
)
2943 if (inhibit_iso_escape_detection
)
2945 single_shifting
= 0;
2947 if (c
== 'N' || c
== 'O')
2949 /* ESC <Fe> for SS2 or SS3. */
2950 single_shifting
= 1;
2951 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2955 /* End of composition. */
2956 if (composition_count
< 0
2957 || composition_count
> MAX_COMPOSITION_COMPONENTS
)
2960 composition_count
= -1;
2961 found
|= CATEGORY_MASK_ISO
;
2963 else if (c
>= '0' && c
<= '4')
2965 /* ESC <Fp> for start/end composition. */
2966 composition_count
= 0;
2970 if (c
>= '(' && c
<= '/')
2972 /* Designation sequence for a charset of dimension 1. */
2974 if (c1
< ' ' || c1
>= 0x80
2975 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2976 /* Invalid designation sequence. Just ignore. */
2981 /* Designation sequence for a charset of dimension 2. */
2983 if (c
>= '@' && c
<= 'B')
2984 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2985 id
= iso_charset_table
[1][0][c
];
2986 else if (c
>= '(' && c
<= '/')
2989 if (c1
< ' ' || c1
>= 0x80
2990 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2991 /* Invalid designation sequence. Just ignore. */
2995 /* Invalid designation sequence. Just ignore it. */
3000 /* Invalid escape sequence. Just ignore it. */
3004 /* We found a valid designation sequence for CHARSET. */
3005 rejected
|= CATEGORY_MASK_ISO_8BIT
;
3006 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
3008 found
|= CATEGORY_MASK_ISO_7
;
3010 rejected
|= CATEGORY_MASK_ISO_7
;
3011 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
3013 found
|= CATEGORY_MASK_ISO_7_TIGHT
;
3015 rejected
|= CATEGORY_MASK_ISO_7_TIGHT
;
3016 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
3018 found
|= CATEGORY_MASK_ISO_7_ELSE
;
3020 rejected
|= CATEGORY_MASK_ISO_7_ELSE
;
3021 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
3023 found
|= CATEGORY_MASK_ISO_8_ELSE
;
3025 rejected
|= CATEGORY_MASK_ISO_8_ELSE
;
3031 /* Locking shift out/in. */
3032 if (inhibit_iso_escape_detection
)
3034 single_shifting
= 0;
3035 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
3039 /* Control sequence introducer. */
3040 single_shifting
= 0;
3041 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
3042 found
|= CATEGORY_MASK_ISO_8_ELSE
;
3043 goto check_extra_latin
;
3048 if (inhibit_iso_escape_detection
)
3050 single_shifting
= 0;
3051 rejected
|= CATEGORY_MASK_ISO_7BIT
;
3052 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
3053 & CODING_ISO_FLAG_SINGLE_SHIFT
)
3055 found
|= CATEGORY_MASK_ISO_8_1
;
3056 single_shifting
= 1;
3058 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
3059 & CODING_ISO_FLAG_SINGLE_SHIFT
)
3061 found
|= CATEGORY_MASK_ISO_8_2
;
3062 single_shifting
= 1;
3064 if (single_shifting
)
3066 goto check_extra_latin
;
3073 if (composition_count
>= 0)
3074 composition_count
++;
3075 single_shifting
= 0;
3080 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
3081 found
|= CATEGORY_MASK_ISO_8_1
;
3082 /* Check the length of succeeding codes of the range
3083 0xA0..0FF. If the byte length is even, we include
3084 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3085 only when we are not single shifting. */
3086 if (! single_shifting
3087 && ! (rejected
& CATEGORY_MASK_ISO_8_2
))
3090 while (src
< src_end
)
3102 if (len
& 1 && src
< src_end
)
3104 rejected
|= CATEGORY_MASK_ISO_8_2
;
3105 if (composition_count
>= 0)
3106 composition_count
+= len
;
3110 found
|= CATEGORY_MASK_ISO_8_2
;
3111 if (composition_count
>= 0)
3112 composition_count
+= len
/ 2;
3118 if (! VECTORP (Vlatin_extra_code_table
)
3119 || NILP (AREF (Vlatin_extra_code_table
, c
)))
3121 rejected
= CATEGORY_MASK_ISO
;
3124 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
3125 & CODING_ISO_FLAG_LATIN_EXTRA
)
3126 found
|= CATEGORY_MASK_ISO_8_1
;
3128 rejected
|= CATEGORY_MASK_ISO_8_1
;
3129 rejected
|= CATEGORY_MASK_ISO_8_2
;
3133 detect_info
->rejected
|= CATEGORY_MASK_ISO
;
3137 detect_info
->rejected
|= rejected
;
3138 detect_info
->found
|= (found
& ~rejected
);
3143 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3144 escape sequence should be kept. */
3145 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3149 if (final < '0' || final >= 128 \
3150 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3151 || !SAFE_CHARSET_P (coding, id)) \
3153 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3157 prev = CODING_ISO_DESIGNATION (coding, reg); \
3158 if (id == charset_jisx0201_roman) \
3160 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3161 id = charset_ascii; \
3163 else if (id == charset_jisx0208_1978) \
3165 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3166 id = charset_jisx0208; \
3168 CODING_ISO_DESIGNATION (coding, reg) = id; \
3169 /* If there was an invalid designation to REG previously, and this \
3170 designation is ASCII to REG, we should keep this designation \
3172 if (prev == -2 && id == charset_ascii) \
3177 /* Handle these composition sequence (ALT: alternate char):
3179 (1) relative composition: ESC 0 CHAR ... ESC 1
3180 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3181 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3182 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3184 When the start sequence (ESC 0/2/3/4) is found, this annotation
3187 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3189 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3190 produced until the end sequence (ESC 1) is found:
3193 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3194 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3195 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3197 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3198 annotation header is updated as below:
3200 (1) LENGTH: unchanged, NCHARS: number of CHARs
3201 (2) LENGTH: unchanged, NCHARS: number of CHARs
3202 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3203 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3205 If an error is found while composing, the annotation header is
3208 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3210 and the sequence [ -2 DECODED-RULE ] is changed to the original
3211 byte sequence as below:
3212 o the original byte sequence is B: [ B -1 ]
3213 o the original byte sequence is B1 B2: [ B1 B2 ]
3214 and the sequence [ -1 -1 ] is changed to the original byte
3219 /* Decode a composition rule C1 and maybe one more byte from the
3220 source, and set RULE to the encoded composition rule. If the rule
3221 is invalid, goto invalid_code. */
3223 #define DECODE_COMPOSITION_RULE(rule) \
3227 goto invalid_code; \
3228 if (rule < 81) /* old format (before ver.21) */ \
3230 int gref = (rule) / 9; \
3231 int nref = (rule) % 9; \
3232 if (gref == 4) gref = 10; \
3233 if (nref == 4) nref = 10; \
3234 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3236 else /* new format (after ver.21) */ \
3240 ONE_MORE_BYTE (b); \
3241 if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32)) \
3242 goto invalid_code; \
3243 rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32); \
3244 rule += 0x100; /* Distinguish it from the old format. */ \
3248 #define ENCODE_COMPOSITION_RULE(rule) \
3250 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3252 if (rule < 0x100) /* old format */ \
3254 if (gref == 10) gref = 4; \
3255 if (nref == 10) nref = 4; \
3256 charbuf[idx] = 32 + gref * 9 + nref; \
3257 charbuf[idx + 1] = -1; \
3260 else /* new format */ \
3262 charbuf[idx] = 32 + 81 + gref; \
3263 charbuf[idx + 1] = 32 + nref; \
3268 /* Finish the current composition as invalid. */
3271 finish_composition (int *charbuf
, struct composition_status
*cmp_status
)
3273 int idx
= - cmp_status
->length
;
3276 /* Recover the original ESC sequence */
3277 charbuf
[idx
++] = ISO_CODE_ESC
;
3278 charbuf
[idx
++] = (cmp_status
->method
== COMPOSITION_RELATIVE
? '0'
3279 : cmp_status
->method
== COMPOSITION_WITH_RULE
? '2'
3280 : cmp_status
->method
== COMPOSITION_WITH_ALTCHARS
? '3'
3281 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3283 charbuf
[idx
++] = -2;
3285 charbuf
[idx
++] = -1;
3286 new_chars
= cmp_status
->nchars
;
3287 if (cmp_status
->method
>= COMPOSITION_WITH_RULE
)
3288 for (; idx
< 0; idx
++)
3290 int elt
= charbuf
[idx
];
3294 ENCODE_COMPOSITION_RULE (charbuf
[idx
+ 1]);
3299 charbuf
[idx
++] = ISO_CODE_ESC
;
3304 cmp_status
->state
= COMPOSING_NO
;
3308 /* If characters are under composition, finish the composition. */
3309 #define MAYBE_FINISH_COMPOSITION() \
3311 if (cmp_status->state != COMPOSING_NO) \
3312 char_offset += finish_composition (charbuf, cmp_status); \
3315 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3317 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3318 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3319 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3320 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3322 Produce this annotation sequence now:
3324 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3327 #define DECODE_COMPOSITION_START(c1) \
3330 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3331 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3332 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3333 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3337 cmp_status->state = COMPOSING_CHAR; \
3338 cmp_status->length += 2; \
3342 MAYBE_FINISH_COMPOSITION (); \
3343 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3344 : c1 == '2' ? COMPOSITION_WITH_RULE \
3345 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3346 : COMPOSITION_WITH_RULE_ALTCHARS); \
3348 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3349 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3350 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3351 cmp_status->nchars = cmp_status->ncomps = 0; \
3352 coding->annotated = 1; \
3357 /* Handle composition end sequence ESC 1. */
3359 #define DECODE_COMPOSITION_END() \
3361 if (cmp_status->nchars == 0 \
3362 || ((cmp_status->state == COMPOSING_CHAR) \
3363 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3365 MAYBE_FINISH_COMPOSITION (); \
3366 goto invalid_code; \
3368 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3369 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3370 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3371 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3372 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3373 char_offset += cmp_status->nchars; \
3374 cmp_status->state = COMPOSING_NO; \
3377 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3379 #define STORE_COMPOSITION_RULE(rule) \
3382 *charbuf++ = rule; \
3383 cmp_status->length += 2; \
3384 cmp_status->state--; \
3387 /* Store a composed char or a component char C in charbuf, and update
3390 #define STORE_COMPOSITION_CHAR(c) \
3393 cmp_status->length++; \
3394 if (cmp_status->state == COMPOSING_CHAR) \
3395 cmp_status->nchars++; \
3397 cmp_status->ncomps++; \
3398 if (cmp_status->method == COMPOSITION_WITH_RULE \
3399 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3400 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3401 cmp_status->state++; \
3405 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3408 decode_coding_iso_2022 (struct coding_system
*coding
)
3410 const unsigned char *src
= coding
->source
+ coding
->consumed
;
3411 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3412 const unsigned char *src_base
;
3413 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
3414 /* We may produce two annotations (charset and composition) in one
3415 loop and one more charset annotation at the end. */
3417 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 3);
3418 ptrdiff_t consumed_chars
= 0, consumed_chars_base
;
3419 bool multibytep
= coding
->src_multibyte
;
3420 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3421 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3422 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3423 int charset_id_2
, charset_id_3
;
3424 struct charset
*charset
;
3426 struct composition_status
*cmp_status
= CODING_ISO_CMP_STATUS (coding
);
3427 Lisp_Object attrs
= CODING_ID_ATTRS (coding
->id
);
3428 ptrdiff_t char_offset
= coding
->produced_char
;
3429 ptrdiff_t last_offset
= char_offset
;
3430 int last_id
= charset_ascii
;
3432 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
3433 int byte_after_cr
= -1;
3436 setup_iso_safe_charsets (attrs
);
3437 coding
->safe_charsets
= SDATA (CODING_ATTR_SAFE_CHARSETS (attrs
));
3439 if (cmp_status
->state
!= COMPOSING_NO
)
3441 if (charbuf_end
- charbuf
< cmp_status
->length
)
3443 for (i
= 0; i
< cmp_status
->length
; i
++)
3444 *charbuf
++ = cmp_status
->carryover
[i
];
3445 coding
->annotated
= 1;
3453 consumed_chars_base
= consumed_chars
;
3455 if (charbuf
>= charbuf_end
)
3457 if (byte_after_cr
>= 0)
3462 if (byte_after_cr
>= 0)
3463 c1
= byte_after_cr
, byte_after_cr
= -1;
3469 if (CODING_ISO_EXTSEGMENT_LEN (coding
) > 0)
3471 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3473 CODING_ISO_EXTSEGMENT_LEN (coding
)--;
3477 if (CODING_ISO_EMBEDDED_UTF_8 (coding
))
3479 if (c1
== ISO_CODE_ESC
)
3481 if (src
+ 1 >= src_end
)
3482 goto no_more_source
;
3483 *charbuf
++ = ISO_CODE_ESC
;
3485 if (src
[0] == '%' && src
[1] == '@')
3488 consumed_chars
+= 2;
3490 /* We are sure charbuf can contain two more chars. */
3493 CODING_ISO_EMBEDDED_UTF_8 (coding
) = 0;
3498 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3504 if ((cmp_status
->state
== COMPOSING_RULE
3505 || cmp_status
->state
== COMPOSING_COMPONENT_RULE
)
3506 && c1
!= ISO_CODE_ESC
)
3510 DECODE_COMPOSITION_RULE (rule
);
3511 STORE_COMPOSITION_RULE (rule
);
3515 /* We produce at most one character. */
3516 switch (iso_code_class
[c1
])
3518 case ISO_0x20_or_0x7F
:
3519 if (charset_id_0
< 0
3520 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
3521 /* This is SPACE or DEL. */
3522 charset
= CHARSET_FROM_ID (charset_ascii
);
3524 charset
= CHARSET_FROM_ID (charset_id_0
);
3527 case ISO_graphic_plane_0
:
3528 if (charset_id_0
< 0)
3529 charset
= CHARSET_FROM_ID (charset_ascii
);
3531 charset
= CHARSET_FROM_ID (charset_id_0
);
3534 case ISO_0xA0_or_0xFF
:
3535 if (charset_id_1
< 0
3536 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
3537 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
3539 /* This is a graphic character, we fall down ... */
3541 case ISO_graphic_plane_1
:
3542 if (charset_id_1
< 0)
3544 charset
= CHARSET_FROM_ID (charset_id_1
);
3548 if (eol_dos
&& c1
== '\r')
3549 ONE_MORE_BYTE (byte_after_cr
);
3550 MAYBE_FINISH_COMPOSITION ();
3551 charset
= CHARSET_FROM_ID (charset_ascii
);
3558 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3559 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
3561 CODING_ISO_INVOCATION (coding
, 0) = 1;
3562 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3566 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
3568 CODING_ISO_INVOCATION (coding
, 0) = 0;
3569 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3572 case ISO_single_shift_2_7
:
3573 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
))
3575 case ISO_single_shift_2
:
3576 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
3578 /* SS2 is handled as an escape sequence of ESC 'N' */
3580 goto label_escape_sequence
;
3582 case ISO_single_shift_3
:
3583 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
3585 /* SS2 is handled as an escape sequence of ESC 'O' */
3587 goto label_escape_sequence
;
3589 case ISO_control_sequence_introducer
:
3590 /* CSI is handled as an escape sequence of ESC '[' ... */
3592 goto label_escape_sequence
;
3596 label_escape_sequence
:
3597 /* Escape sequences handled here are invocation,
3598 designation, direction specification, and character
3599 composition specification. */
3602 case '&': /* revision of following character set */
3604 if (!(c1
>= '@' && c1
<= '~'))
3607 if (c1
!= ISO_CODE_ESC
)
3610 goto label_escape_sequence
;
3612 case '$': /* designation of 2-byte character set */
3613 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3619 if (c1
>= '@' && c1
<= 'B')
3620 { /* designation of JISX0208.1978, GB2312.1980,
3622 reg
= 0, chars96
= 0;
3624 else if (c1
>= 0x28 && c1
<= 0x2B)
3625 { /* designation of DIMENSION2_CHARS94 character set */
3626 reg
= c1
- 0x28, chars96
= 0;
3629 else if (c1
>= 0x2C && c1
<= 0x2F)
3630 { /* designation of DIMENSION2_CHARS96 character set */
3631 reg
= c1
- 0x2C, chars96
= 1;
3636 DECODE_DESIGNATION (reg
, 2, chars96
, c1
);
3637 /* We must update these variables now. */
3639 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3641 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3647 case 'n': /* invocation of locking-shift-2 */
3648 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3649 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3651 CODING_ISO_INVOCATION (coding
, 0) = 2;
3652 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3655 case 'o': /* invocation of locking-shift-3 */
3656 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3657 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3659 CODING_ISO_INVOCATION (coding
, 0) = 3;
3660 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3663 case 'N': /* invocation of single-shift-2 */
3664 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3665 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3667 charset_id_2
= CODING_ISO_DESIGNATION (coding
, 2);
3668 if (charset_id_2
< 0)
3669 charset
= CHARSET_FROM_ID (charset_ascii
);
3671 charset
= CHARSET_FROM_ID (charset_id_2
);
3673 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3677 case 'O': /* invocation of single-shift-3 */
3678 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3679 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3681 charset_id_3
= CODING_ISO_DESIGNATION (coding
, 3);
3682 if (charset_id_3
< 0)
3683 charset
= CHARSET_FROM_ID (charset_ascii
);
3685 charset
= CHARSET_FROM_ID (charset_id_3
);
3687 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3691 case '0': case '2': case '3': case '4': /* start composition */
3692 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
3694 if (last_id
!= charset_ascii
)
3696 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
3697 last_id
= charset_ascii
;
3698 last_offset
= char_offset
;
3700 DECODE_COMPOSITION_START (c1
);
3703 case '1': /* end composition */
3704 if (cmp_status
->state
== COMPOSING_NO
)
3706 DECODE_COMPOSITION_END ();
3709 case '[': /* specification of direction */
3710 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
))
3712 /* For the moment, nested direction is not supported.
3713 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3714 left-to-right, and nonzero means right-to-left. */
3718 case ']': /* end of the current direction */
3719 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3721 case '0': /* end of the current direction */
3722 case '1': /* start of left-to-right direction */
3725 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3730 case '2': /* start of right-to-left direction */
3733 coding
->mode
|= CODING_MODE_DIRECTION
;
3747 /* CTEXT extended segment:
3748 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3749 We keep these bytes as is for the moment.
3750 They may be decoded by post-read-conversion. */
3754 ONE_MORE_BYTE (dim
);
3755 if (dim
< '0' || dim
> '4')
3763 size
= ((M
- 128) * 128) + (L
- 128);
3764 if (charbuf
+ 6 > charbuf_end
)
3766 *charbuf
++ = ISO_CODE_ESC
;
3770 *charbuf
++ = BYTE8_TO_CHAR (M
);
3771 *charbuf
++ = BYTE8_TO_CHAR (L
);
3772 CODING_ISO_EXTSEGMENT_LEN (coding
) = size
;
3776 /* XFree86 extension for embedding UTF-8 in CTEXT:
3777 ESC % G --UTF-8-BYTES-- ESC % @
3778 We keep these bytes as is for the moment.
3779 They may be decoded by post-read-conversion. */
3780 if (charbuf
+ 3 > charbuf_end
)
3782 *charbuf
++ = ISO_CODE_ESC
;
3785 CODING_ISO_EMBEDDED_UTF_8 (coding
) = 1;
3793 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3798 if (c1
>= 0x28 && c1
<= 0x2B)
3799 { /* designation of DIMENSION1_CHARS94 character set */
3800 reg
= c1
- 0x28, chars96
= 0;
3803 else if (c1
>= 0x2C && c1
<= 0x2F)
3804 { /* designation of DIMENSION1_CHARS96 character set */
3805 reg
= c1
- 0x2C, chars96
= 1;
3810 DECODE_DESIGNATION (reg
, 1, chars96
, c1
);
3811 /* We must update these variables now. */
3813 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3815 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3827 if (cmp_status
->state
== COMPOSING_NO
3828 && charset
->id
!= charset_ascii
3829 && last_id
!= charset
->id
)
3831 if (last_id
!= charset_ascii
)
3832 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
3833 last_id
= charset
->id
;
3834 last_offset
= char_offset
;
3837 /* Now we know CHARSET and 1st position code C1 of a character.
3838 Produce a decoded character while getting 2nd and 3rd
3839 position codes C2, C3 if necessary. */
3840 if (CHARSET_DIMENSION (charset
) > 1)
3843 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0)
3844 || ((c1
& 0x80) != (c2
& 0x80)))
3845 /* C2 is not in a valid range. */
3847 if (CHARSET_DIMENSION (charset
) == 2)
3848 c1
= (c1
<< 8) | c2
;
3852 if (c3
< 0x20 || (c3
>= 0x80 && c3
< 0xA0)
3853 || ((c1
& 0x80) != (c3
& 0x80)))
3854 /* C3 is not in a valid range. */
3856 c1
= (c1
<< 16) | (c2
<< 8) | c2
;
3860 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3863 MAYBE_FINISH_COMPOSITION ();
3864 for (; src_base
< src
; src_base
++, char_offset
++)
3866 if (ASCII_BYTE_P (*src_base
))
3867 *charbuf
++ = *src_base
;
3869 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3872 else if (cmp_status
->state
== COMPOSING_NO
)
3877 else if ((cmp_status
->state
== COMPOSING_CHAR
3878 ? cmp_status
->nchars
3879 : cmp_status
->ncomps
)
3880 >= MAX_COMPOSITION_COMPONENTS
)
3882 /* Too long composition. */
3883 MAYBE_FINISH_COMPOSITION ();
3888 STORE_COMPOSITION_CHAR (c
);
3892 MAYBE_FINISH_COMPOSITION ();
3894 consumed_chars
= consumed_chars_base
;
3896 *charbuf
++ = c
< 0 ? -c
: ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3906 if (cmp_status
->state
!= COMPOSING_NO
)
3908 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
3909 MAYBE_FINISH_COMPOSITION ();
3912 charbuf
-= cmp_status
->length
;
3913 for (i
= 0; i
< cmp_status
->length
; i
++)
3914 cmp_status
->carryover
[i
] = charbuf
[i
];
3917 else if (last_id
!= charset_ascii
)
3918 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
3919 coding
->consumed_char
+= consumed_chars_base
;
3920 coding
->consumed
= src_base
- coding
->source
;
3921 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3925 /* ISO2022 encoding stuff. */
3928 It is not enough to say just "ISO2022" on encoding, we have to
3929 specify more details. In Emacs, each coding system of ISO2022
3930 variant has the following specifications:
3931 1. Initial designation to G0 thru G3.
3932 2. Allows short-form designation?
3933 3. ASCII should be designated to G0 before control characters?
3934 4. ASCII should be designated to G0 at end of line?
3935 5. 7-bit environment or 8-bit environment?
3936 6. Use locking-shift?
3937 7. Use Single-shift?
3938 And the following two are only for Japanese:
3939 8. Use ASCII in place of JIS0201-1976-Roman?
3940 9. Use JISX0208-1983 in place of JISX0208-1978?
3941 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3942 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3946 /* Produce codes (escape sequence) for designating CHARSET to graphic
3947 register REG at DST, and increment DST. If <final-char> of CHARSET is
3948 '@', 'A', or 'B' and the coding system CODING allows, produce
3949 designation sequence of short-form. */
3951 #define ENCODE_DESIGNATION(charset, reg, coding) \
3953 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3954 const char *intermediate_char_94 = "()*+"; \
3955 const char *intermediate_char_96 = ",-./"; \
3956 int revision = -1; \
3958 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3959 revision = CHARSET_ISO_REVISION (charset); \
3961 if (revision >= 0) \
3963 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3964 EMIT_ONE_BYTE ('@' + revision); \
3966 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3967 if (CHARSET_DIMENSION (charset) == 1) \
3970 if (! CHARSET_ISO_CHARS_96 (charset)) \
3971 b = intermediate_char_94[reg]; \
3973 b = intermediate_char_96[reg]; \
3974 EMIT_ONE_ASCII_BYTE (b); \
3978 EMIT_ONE_ASCII_BYTE ('$'); \
3979 if (! CHARSET_ISO_CHARS_96 (charset)) \
3981 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3983 || final_char < '@' || final_char > 'B') \
3984 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3987 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3989 EMIT_ONE_ASCII_BYTE (final_char); \
3991 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3995 /* The following two macros produce codes (control character or escape
3996 sequence) for ISO2022 single-shift functions (single-shift-2 and
3999 #define ENCODE_SINGLE_SHIFT_2 \
4001 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4002 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4004 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4005 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4009 #define ENCODE_SINGLE_SHIFT_3 \
4011 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4012 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4014 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4015 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4019 /* The following four macros produce codes (control character or
4020 escape sequence) for ISO2022 locking-shift functions (shift-in,
4021 shift-out, locking-shift-2, and locking-shift-3). */
4023 #define ENCODE_SHIFT_IN \
4025 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4026 CODING_ISO_INVOCATION (coding, 0) = 0; \
4030 #define ENCODE_SHIFT_OUT \
4032 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4033 CODING_ISO_INVOCATION (coding, 0) = 1; \
4037 #define ENCODE_LOCKING_SHIFT_2 \
4039 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4040 CODING_ISO_INVOCATION (coding, 0) = 2; \
4044 #define ENCODE_LOCKING_SHIFT_3 \
4046 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4047 CODING_ISO_INVOCATION (coding, 0) = 3; \
4051 /* Produce codes for a DIMENSION1 character whose character set is
4052 CHARSET and whose position-code is C1. Designation and invocation
4053 sequences are also produced in advance if necessary. */
4055 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4057 int id = CHARSET_ID (charset); \
4059 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4060 && id == charset_ascii) \
4062 id = charset_jisx0201_roman; \
4063 charset = CHARSET_FROM_ID (id); \
4066 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4068 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4069 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4071 EMIT_ONE_BYTE (c1 | 0x80); \
4072 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4075 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4077 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4080 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4082 EMIT_ONE_BYTE (c1 | 0x80); \
4086 /* Since CHARSET is not yet invoked to any graphic planes, we \
4087 must invoke it, or, at first, designate it to some graphic \
4088 register. Then repeat the loop to actually produce the \
4090 dst = encode_invocation_designation (charset, coding, dst, \
4095 /* Produce codes for a DIMENSION2 character whose character set is
4096 CHARSET and whose position-codes are C1 and C2. Designation and
4097 invocation codes are also produced in advance if necessary. */
4099 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4101 int id = CHARSET_ID (charset); \
4103 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4104 && id == charset_jisx0208) \
4106 id = charset_jisx0208_1978; \
4107 charset = CHARSET_FROM_ID (id); \
4110 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4112 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4113 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4115 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4116 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4119 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4121 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4124 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4126 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4130 /* Since CHARSET is not yet invoked to any graphic planes, we \
4131 must invoke it, or, at first, designate it to some graphic \
4132 register. Then repeat the loop to actually produce the \
4134 dst = encode_invocation_designation (charset, coding, dst, \
4139 #define ENCODE_ISO_CHARACTER(charset, c) \
4142 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4144 if (CHARSET_DIMENSION (charset) == 1) \
4145 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4147 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4151 /* Produce designation and invocation codes at a place pointed by DST
4152 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4155 static unsigned char *
4156 encode_invocation_designation (struct charset
*charset
,
4157 struct coding_system
*coding
,
4158 unsigned char *dst
, ptrdiff_t *p_nchars
)
4160 bool multibytep
= coding
->dst_multibyte
;
4161 ptrdiff_t produced_chars
= *p_nchars
;
4162 int reg
; /* graphic register number */
4163 int id
= CHARSET_ID (charset
);
4165 /* At first, check designations. */
4166 for (reg
= 0; reg
< 4; reg
++)
4167 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
4172 /* CHARSET is not yet designated to any graphic registers. */
4173 /* At first check the requested designation. */
4174 reg
= CODING_ISO_REQUEST (coding
, id
);
4176 /* Since CHARSET requests no special designation, designate it
4177 to graphic register 0. */
4180 ENCODE_DESIGNATION (charset
, reg
, coding
);
4183 if (CODING_ISO_INVOCATION (coding
, 0) != reg
4184 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
4186 /* Since the graphic register REG is not invoked to any graphic
4187 planes, invoke it to graphic plane 0. */
4190 case 0: /* graphic register 0 */
4194 case 1: /* graphic register 1 */
4198 case 2: /* graphic register 2 */
4199 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
4200 ENCODE_SINGLE_SHIFT_2
;
4202 ENCODE_LOCKING_SHIFT_2
;
4205 case 3: /* graphic register 3 */
4206 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
4207 ENCODE_SINGLE_SHIFT_3
;
4209 ENCODE_LOCKING_SHIFT_3
;
4214 *p_nchars
= produced_chars
;
4219 /* Produce codes for designation and invocation to reset the graphic
4220 planes and registers to initial state. */
4221 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4224 struct charset *charset; \
4226 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4228 for (reg = 0; reg < 4; reg++) \
4229 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4230 && (CODING_ISO_DESIGNATION (coding, reg) \
4231 != CODING_ISO_INITIAL (coding, reg))) \
4233 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4234 ENCODE_DESIGNATION (charset, reg, coding); \
4239 /* Produce designation sequences of charsets in the line started from
4240 CHARBUF to a place pointed by DST, and return the number of
4241 produced bytes. DST should not directly point a buffer text area
4242 which may be relocated by char_charset call.
4244 If the current block ends before any end-of-line, we may fail to
4245 find all the necessary designations. */
4248 encode_designation_at_bol (struct coding_system
*coding
,
4249 int *charbuf
, int *charbuf_end
,
4252 unsigned char *orig
= dst
;
4253 struct charset
*charset
;
4254 /* Table of charsets to be designated to each graphic register. */
4256 int c
, found
= 0, reg
;
4257 ptrdiff_t produced_chars
= 0;
4258 bool multibytep
= coding
->dst_multibyte
;
4260 Lisp_Object charset_list
;
4262 attrs
= CODING_ID_ATTRS (coding
->id
);
4263 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
4264 if (EQ (charset_list
, Qiso_2022
))
4265 charset_list
= Viso_2022_charset_list
;
4267 for (reg
= 0; reg
< 4; reg
++)
4270 while (charbuf
< charbuf_end
&& found
< 4)
4277 charset
= char_charset (c
, charset_list
, NULL
);
4278 id
= CHARSET_ID (charset
);
4279 reg
= CODING_ISO_REQUEST (coding
, id
);
4280 if (reg
>= 0 && r
[reg
] < 0)
4289 for (reg
= 0; reg
< 4; reg
++)
4291 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
4292 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
4298 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4301 encode_coding_iso_2022 (struct coding_system
*coding
)
4303 bool multibytep
= coding
->dst_multibyte
;
4304 int *charbuf
= coding
->charbuf
;
4305 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4306 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4307 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4309 bool bol_designation
4310 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4311 && CODING_ISO_BOL (coding
));
4312 ptrdiff_t produced_chars
= 0;
4313 Lisp_Object attrs
, eol_type
, charset_list
;
4314 bool ascii_compatible
;
4316 int preferred_charset_id
= -1;
4318 CODING_GET_INFO (coding
, attrs
, charset_list
);
4319 eol_type
= inhibit_eol_conversion
? Qunix
: CODING_ID_EOL_TYPE (coding
->id
);
4320 if (VECTORP (eol_type
))
4323 setup_iso_safe_charsets (attrs
);
4324 /* Charset list may have been changed. */
4325 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
4326 coding
->safe_charsets
= SDATA (CODING_ATTR_SAFE_CHARSETS (attrs
));
4329 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
))
4330 && ! (CODING_ISO_FLAGS (coding
) & (CODING_ISO_FLAG_DESIGNATION
4331 | CODING_ISO_FLAG_LOCKING_SHIFT
)));
4333 while (charbuf
< charbuf_end
)
4335 ASSURE_DESTINATION (safe_room
);
4337 if (bol_designation
)
4339 /* We have to produce designation sequences if any now. */
4340 unsigned char desig_buf
[16];
4344 charset_map_loaded
= 0;
4345 nbytes
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
,
4347 if (charset_map_loaded
4348 && (offset
= coding_change_destination (coding
)))
4353 memcpy (dst
, desig_buf
, nbytes
);
4355 /* We are sure that designation sequences are all ASCII bytes. */
4356 produced_chars
+= nbytes
;
4357 bol_designation
= 0;
4358 ASSURE_DESTINATION (safe_room
);
4365 /* Handle an annotation. */
4368 case CODING_ANNOTATE_COMPOSITION_MASK
:
4369 /* Not yet implemented. */
4371 case CODING_ANNOTATE_CHARSET_MASK
:
4372 preferred_charset_id
= charbuf
[2];
4373 if (preferred_charset_id
>= 0
4374 && NILP (Fmemq (make_number (preferred_charset_id
),
4376 preferred_charset_id
= -1;
4385 /* Now encode the character C. */
4386 if (c
< 0x20 || c
== 0x7F)
4389 || (c
== '\r' && EQ (eol_type
, Qmac
)))
4391 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
4392 ENCODE_RESET_PLANE_AND_REGISTER ();
4393 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
4397 for (i
= 0; i
< 4; i
++)
4398 CODING_ISO_DESIGNATION (coding
, i
)
4399 = CODING_ISO_INITIAL (coding
, i
);
4401 bol_designation
= ((CODING_ISO_FLAGS (coding
)
4402 & CODING_ISO_FLAG_DESIGNATE_AT_BOL
)
4405 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
4406 ENCODE_RESET_PLANE_AND_REGISTER ();
4407 EMIT_ONE_ASCII_BYTE (c
);
4409 else if (ASCII_CHAR_P (c
))
4411 if (ascii_compatible
)
4412 EMIT_ONE_ASCII_BYTE (c
);
4415 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
4416 ENCODE_ISO_CHARACTER (charset
, c
);
4419 else if (CHAR_BYTE8_P (c
))
4421 c
= CHAR_TO_BYTE8 (c
);
4426 struct charset
*charset
;
4428 if (preferred_charset_id
>= 0)
4432 charset
= CHARSET_FROM_ID (preferred_charset_id
);
4433 CODING_CHAR_CHARSET_P (coding
, dst
, dst_end
, c
, charset
, result
);
4435 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
4439 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
4443 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4445 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4446 charset
= CHARSET_FROM_ID (charset_ascii
);
4450 c
= coding
->default_char
;
4451 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
,
4452 charset_list
, NULL
, charset
);
4455 ENCODE_ISO_CHARACTER (charset
, c
);
4459 if (coding
->mode
& CODING_MODE_LAST_BLOCK
4460 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
4462 ASSURE_DESTINATION (safe_room
);
4463 ENCODE_RESET_PLANE_AND_REGISTER ();
4465 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4466 CODING_ISO_BOL (coding
) = bol_designation
;
4467 coding
->produced_char
+= produced_chars
;
4468 coding
->produced
= dst
- coding
->destination
;
4473 /*** 8,9. SJIS and BIG5 handlers ***/
4475 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4476 quite widely. So, for the moment, Emacs supports them in the bare
4477 C code. But, in the future, they may be supported only by CCL. */
4479 /* SJIS is a coding system encoding three character sets: ASCII, right
4480 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4481 as is. A character of charset katakana-jisx0201 is encoded by
4482 "position-code + 0x80". A character of charset japanese-jisx0208
4483 is encoded in 2-byte but two position-codes are divided and shifted
4484 so that it fit in the range below.
4486 --- CODE RANGE of SJIS ---
4487 (character set) (range)
4489 KATAKANA-JISX0201 0xA0 .. 0xDF
4490 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4491 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4492 -------------------------------
4496 /* BIG5 is a coding system encoding two character sets: ASCII and
4497 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4498 character set and is encoded in two-byte.
4500 --- CODE RANGE of BIG5 ---
4501 (character set) (range)
4503 Big5 (1st byte) 0xA1 .. 0xFE
4504 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4505 --------------------------
4509 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4510 Return true if a text is encoded in SJIS. */
4513 detect_coding_sjis (struct coding_system
*coding
,
4514 struct coding_detection_info
*detect_info
)
4516 const unsigned char *src
= coding
->source
, *src_base
;
4517 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4518 bool multibytep
= coding
->src_multibyte
;
4519 ptrdiff_t consumed_chars
= 0;
4522 Lisp_Object attrs
, charset_list
;
4523 int max_first_byte_of_2_byte_code
;
4525 CODING_GET_INFO (coding
, attrs
, charset_list
);
4526 max_first_byte_of_2_byte_code
4527 = (XINT (Flength (charset_list
)) > 3 ? 0xFC : 0xEF);
4529 detect_info
->checked
|= CATEGORY_MASK_SJIS
;
4530 /* A coding system of this category is always ASCII compatible. */
4531 src
+= coding
->head_ascii
;
4539 if ((c
>= 0x81 && c
<= 0x9F)
4540 || (c
>= 0xE0 && c
<= max_first_byte_of_2_byte_code
))
4543 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
4545 found
= CATEGORY_MASK_SJIS
;
4547 else if (c
>= 0xA0 && c
< 0xE0)
4548 found
= CATEGORY_MASK_SJIS
;
4552 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
4556 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4558 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
4561 detect_info
->found
|= found
;
4565 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4566 Return true if a text is encoded in BIG5. */
4569 detect_coding_big5 (struct coding_system
*coding
,
4570 struct coding_detection_info
*detect_info
)
4572 const unsigned char *src
= coding
->source
, *src_base
;
4573 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4574 bool multibytep
= coding
->src_multibyte
;
4575 ptrdiff_t consumed_chars
= 0;
4579 detect_info
->checked
|= CATEGORY_MASK_BIG5
;
4580 /* A coding system of this category is always ASCII compatible. */
4581 src
+= coding
->head_ascii
;
4592 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
4594 found
= CATEGORY_MASK_BIG5
;
4599 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
4603 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4605 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
4608 detect_info
->found
|= found
;
4612 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4615 decode_coding_sjis (struct coding_system
*coding
)
4617 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4618 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4619 const unsigned char *src_base
;
4620 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
4621 /* We may produce one charset annotation in one loop and one more at
4624 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 2);
4625 ptrdiff_t consumed_chars
= 0, consumed_chars_base
;
4626 bool multibytep
= coding
->src_multibyte
;
4627 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4628 struct charset
*charset_kanji2
;
4629 Lisp_Object attrs
, charset_list
, val
;
4630 ptrdiff_t char_offset
= coding
->produced_char
;
4631 ptrdiff_t last_offset
= char_offset
;
4632 int last_id
= charset_ascii
;
4634 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
4635 int byte_after_cr
= -1;
4637 CODING_GET_INFO (coding
, attrs
, charset_list
);
4640 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4641 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4642 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4643 charset_kanji2
= NILP (val
) ? NULL
: CHARSET_FROM_ID (XINT (XCAR (val
)));
4648 struct charset
*charset
;
4651 consumed_chars_base
= consumed_chars
;
4653 if (charbuf
>= charbuf_end
)
4655 if (byte_after_cr
>= 0)
4660 if (byte_after_cr
>= 0)
4661 c
= byte_after_cr
, byte_after_cr
= -1;
4668 if (eol_dos
&& c
== '\r')
4669 ONE_MORE_BYTE (byte_after_cr
);
4670 charset
= charset_roman
;
4672 else if (c
== 0x80 || c
== 0xA0)
4674 else if (c
>= 0xA1 && c
<= 0xDF)
4676 /* SJIS -> JISX0201-Kana */
4678 charset
= charset_kana
;
4682 /* SJIS -> JISX0208 */
4684 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
4688 charset
= charset_kanji
;
4690 else if (c
<= 0xFC && charset_kanji2
)
4692 /* SJIS -> JISX0213-2 */
4694 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
4698 charset
= charset_kanji2
;
4702 if (charset
->id
!= charset_ascii
4703 && last_id
!= charset
->id
)
4705 if (last_id
!= charset_ascii
)
4706 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4707 last_id
= charset
->id
;
4708 last_offset
= char_offset
;
4710 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4717 consumed_chars
= consumed_chars_base
;
4719 *charbuf
++ = c
< 0 ? -c
: BYTE8_TO_CHAR (c
);
4725 if (last_id
!= charset_ascii
)
4726 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4727 coding
->consumed_char
+= consumed_chars_base
;
4728 coding
->consumed
= src_base
- coding
->source
;
4729 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4733 decode_coding_big5 (struct coding_system
*coding
)
4735 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4736 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4737 const unsigned char *src_base
;
4738 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
4739 /* We may produce one charset annotation in one loop and one more at
4742 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 2);
4743 ptrdiff_t consumed_chars
= 0, consumed_chars_base
;
4744 bool multibytep
= coding
->src_multibyte
;
4745 struct charset
*charset_roman
, *charset_big5
;
4746 Lisp_Object attrs
, charset_list
, val
;
4747 ptrdiff_t char_offset
= coding
->produced_char
;
4748 ptrdiff_t last_offset
= char_offset
;
4749 int last_id
= charset_ascii
;
4751 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
4752 int byte_after_cr
= -1;
4754 CODING_GET_INFO (coding
, attrs
, charset_list
);
4756 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4757 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4762 struct charset
*charset
;
4765 consumed_chars_base
= consumed_chars
;
4767 if (charbuf
>= charbuf_end
)
4769 if (byte_after_cr
>= 0)
4774 if (byte_after_cr
>= 0)
4775 c
= byte_after_cr
, byte_after_cr
= -1;
4783 if (eol_dos
&& c
== '\r')
4784 ONE_MORE_BYTE (byte_after_cr
);
4785 charset
= charset_roman
;
4790 if (c
< 0xA1 || c
> 0xFE)
4793 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
4796 charset
= charset_big5
;
4798 if (charset
->id
!= charset_ascii
4799 && last_id
!= charset
->id
)
4801 if (last_id
!= charset_ascii
)
4802 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4803 last_id
= charset
->id
;
4804 last_offset
= char_offset
;
4806 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4813 consumed_chars
= consumed_chars_base
;
4815 *charbuf
++ = c
< 0 ? -c
: BYTE8_TO_CHAR (c
);
4821 if (last_id
!= charset_ascii
)
4822 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4823 coding
->consumed_char
+= consumed_chars_base
;
4824 coding
->consumed
= src_base
- coding
->source
;
4825 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4828 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4829 This function can encode charsets `ascii', `katakana-jisx0201',
4830 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4831 are sure that all these charsets are registered as official charset
4832 (i.e. do not have extended leading-codes). Characters of other
4833 charsets are produced without any encoding. */
4836 encode_coding_sjis (struct coding_system
*coding
)
4838 bool multibytep
= coding
->dst_multibyte
;
4839 int *charbuf
= coding
->charbuf
;
4840 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4841 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4842 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4844 ptrdiff_t produced_chars
= 0;
4845 Lisp_Object attrs
, charset_list
, val
;
4846 bool ascii_compatible
;
4847 struct charset
*charset_kanji
, *charset_kana
;
4848 struct charset
*charset_kanji2
;
4851 CODING_GET_INFO (coding
, attrs
, charset_list
);
4852 val
= XCDR (charset_list
);
4853 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4854 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4855 charset_kanji2
= NILP (val
) ? NULL
: CHARSET_FROM_ID (XINT (XCAR (val
)));
4857 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4859 while (charbuf
< charbuf_end
)
4861 ASSURE_DESTINATION (safe_room
);
4863 /* Now encode the character C. */
4864 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4865 EMIT_ONE_ASCII_BYTE (c
);
4866 else if (CHAR_BYTE8_P (c
))
4868 c
= CHAR_TO_BYTE8 (c
);
4874 struct charset
*charset
;
4875 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
4880 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4882 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4883 charset
= CHARSET_FROM_ID (charset_ascii
);
4887 c
= coding
->default_char
;
4888 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
,
4889 charset_list
, &code
, charset
);
4892 if (code
== CHARSET_INVALID_CODE (charset
))
4894 if (charset
== charset_kanji
)
4898 c1
= code
>> 8, c2
= code
& 0xFF;
4899 EMIT_TWO_BYTES (c1
, c2
);
4901 else if (charset
== charset_kana
)
4902 EMIT_ONE_BYTE (code
| 0x80);
4903 else if (charset_kanji2
&& charset
== charset_kanji2
)
4908 if (c1
== 0x21 || (c1
>= 0x23 && c1
<= 0x25)
4910 || (c1
>= 0x2C && c1
<= 0x2F) || c1
>= 0x6E)
4912 JIS_TO_SJIS2 (code
);
4913 c1
= code
>> 8, c2
= code
& 0xFF;
4914 EMIT_TWO_BYTES (c1
, c2
);
4917 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4920 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4923 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4924 coding
->produced_char
+= produced_chars
;
4925 coding
->produced
= dst
- coding
->destination
;
4930 encode_coding_big5 (struct coding_system
*coding
)
4932 bool multibytep
= coding
->dst_multibyte
;
4933 int *charbuf
= coding
->charbuf
;
4934 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4935 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4936 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4938 ptrdiff_t produced_chars
= 0;
4939 Lisp_Object attrs
, charset_list
, val
;
4940 bool ascii_compatible
;
4941 struct charset
*charset_big5
;
4944 CODING_GET_INFO (coding
, attrs
, charset_list
);
4945 val
= XCDR (charset_list
);
4946 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4947 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4949 while (charbuf
< charbuf_end
)
4951 ASSURE_DESTINATION (safe_room
);
4953 /* Now encode the character C. */
4954 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4955 EMIT_ONE_ASCII_BYTE (c
);
4956 else if (CHAR_BYTE8_P (c
))
4958 c
= CHAR_TO_BYTE8 (c
);
4964 struct charset
*charset
;
4965 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
4970 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4972 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4973 charset
= CHARSET_FROM_ID (charset_ascii
);
4977 c
= coding
->default_char
;
4978 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
,
4979 charset_list
, &code
, charset
);
4982 if (code
== CHARSET_INVALID_CODE (charset
))
4984 if (charset
== charset_big5
)
4988 c1
= code
>> 8, c2
= code
& 0xFF;
4989 EMIT_TWO_BYTES (c1
, c2
);
4992 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4995 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4996 coding
->produced_char
+= produced_chars
;
4997 coding
->produced
= dst
- coding
->destination
;
5002 /*** 10. CCL handlers ***/
5004 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5005 Return true if a text is encoded in a coding system of which
5006 encoder/decoder are written in CCL program. */
5009 detect_coding_ccl (struct coding_system
*coding
,
5010 struct coding_detection_info
*detect_info
)
5012 const unsigned char *src
= coding
->source
, *src_base
;
5013 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5014 bool multibytep
= coding
->src_multibyte
;
5015 ptrdiff_t consumed_chars
= 0;
5017 unsigned char *valids
;
5018 ptrdiff_t head_ascii
= coding
->head_ascii
;
5021 detect_info
->checked
|= CATEGORY_MASK_CCL
;
5023 coding
= &coding_categories
[coding_category_ccl
];
5024 valids
= CODING_CCL_VALIDS (coding
);
5025 attrs
= CODING_ID_ATTRS (coding
->id
);
5026 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
5035 if (c
< 0 || ! valids
[c
])
5037 if ((valids
[c
] > 1))
5038 found
= CATEGORY_MASK_CCL
;
5040 detect_info
->rejected
|= CATEGORY_MASK_CCL
;
5044 detect_info
->found
|= found
;
5049 decode_coding_ccl (struct coding_system
*coding
)
5051 const unsigned char *src
= coding
->source
+ coding
->consumed
;
5052 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5053 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
5054 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
;
5055 ptrdiff_t consumed_chars
= 0;
5056 bool multibytep
= coding
->src_multibyte
;
5057 struct ccl_program
*ccl
= &coding
->spec
.ccl
->ccl
;
5058 int source_charbuf
[1024];
5059 int source_byteidx
[1025];
5060 Lisp_Object attrs
, charset_list
;
5062 CODING_GET_INFO (coding
, attrs
, charset_list
);
5066 const unsigned char *p
= src
;
5072 while (i
< 1024 && p
< src_end
)
5074 source_byteidx
[i
] = p
- src
;
5075 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
5077 source_byteidx
[i
] = p
- src
;
5080 while (i
< 1024 && p
< src_end
)
5081 source_charbuf
[i
++] = *p
++;
5083 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
5084 ccl
->last_block
= 1;
5085 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5086 charset_map_loaded
= 0;
5087 ccl_driver (ccl
, source_charbuf
, charbuf
, i
, charbuf_end
- charbuf
,
5089 if (charset_map_loaded
5090 && (offset
= coding_change_source (coding
)))
5096 charbuf
+= ccl
->produced
;
5098 src
+= source_byteidx
[ccl
->consumed
];
5100 src
+= ccl
->consumed
;
5101 consumed_chars
+= ccl
->consumed
;
5102 if (p
== src_end
|| ccl
->status
!= CCL_STAT_SUSPEND_BY_SRC
)
5106 switch (ccl
->status
)
5108 case CCL_STAT_SUSPEND_BY_SRC
:
5109 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
5111 case CCL_STAT_SUSPEND_BY_DST
:
5112 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_DST
);
5115 case CCL_STAT_INVALID_CMD
:
5116 record_conversion_result (coding
, CODING_RESULT_INTERRUPT
);
5119 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5122 coding
->consumed_char
+= consumed_chars
;
5123 coding
->consumed
= src
- coding
->source
;
5124 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
5128 encode_coding_ccl (struct coding_system
*coding
)
5130 struct ccl_program
*ccl
= &coding
->spec
.ccl
->ccl
;
5131 bool multibytep
= coding
->dst_multibyte
;
5132 int *charbuf
= coding
->charbuf
;
5133 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5134 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5135 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5136 int destination_charbuf
[1024];
5137 ptrdiff_t produced_chars
= 0;
5139 Lisp_Object attrs
, charset_list
;
5141 CODING_GET_INFO (coding
, attrs
, charset_list
);
5142 if (coding
->consumed_char
== coding
->src_chars
5143 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
5144 ccl
->last_block
= 1;
5150 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5151 charset_map_loaded
= 0;
5152 ccl_driver (ccl
, charbuf
, destination_charbuf
,
5153 charbuf_end
- charbuf
, 1024, charset_list
);
5154 if (charset_map_loaded
5155 && (offset
= coding_change_destination (coding
)))
5159 ASSURE_DESTINATION (ccl
->produced
* 2);
5160 for (i
= 0; i
< ccl
->produced
; i
++)
5161 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
5165 ASSURE_DESTINATION (ccl
->produced
);
5166 for (i
= 0; i
< ccl
->produced
; i
++)
5167 *dst
++ = destination_charbuf
[i
] & 0xFF;
5168 produced_chars
+= ccl
->produced
;
5170 charbuf
+= ccl
->consumed
;
5171 if (ccl
->status
== CCL_STAT_QUIT
5172 || ccl
->status
== CCL_STAT_INVALID_CMD
)
5175 while (charbuf
< charbuf_end
);
5177 switch (ccl
->status
)
5179 case CCL_STAT_SUSPEND_BY_SRC
:
5180 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
5182 case CCL_STAT_SUSPEND_BY_DST
:
5183 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_DST
);
5186 case CCL_STAT_INVALID_CMD
:
5187 record_conversion_result (coding
, CODING_RESULT_INTERRUPT
);
5190 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5194 coding
->produced_char
+= produced_chars
;
5195 coding
->produced
= dst
- coding
->destination
;
5200 /*** 10, 11. no-conversion handlers ***/
5202 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5205 decode_coding_raw_text (struct coding_system
*coding
)
5208 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
5210 coding
->chars_at_source
= 1;
5211 coding
->consumed_char
= coding
->src_chars
;
5212 coding
->consumed
= coding
->src_bytes
;
5213 if (eol_dos
&& coding
->source
[coding
->src_bytes
- 1] == '\r')
5215 coding
->consumed_char
--;
5217 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
5220 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5224 encode_coding_raw_text (struct coding_system
*coding
)
5226 bool multibytep
= coding
->dst_multibyte
;
5227 int *charbuf
= coding
->charbuf
;
5228 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
5229 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5230 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5231 ptrdiff_t produced_chars
= 0;
5236 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
5238 if (coding
->src_multibyte
)
5239 while (charbuf
< charbuf_end
)
5241 ASSURE_DESTINATION (safe_room
);
5243 if (ASCII_CHAR_P (c
))
5244 EMIT_ONE_ASCII_BYTE (c
);
5245 else if (CHAR_BYTE8_P (c
))
5247 c
= CHAR_TO_BYTE8 (c
);
5252 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
5254 CHAR_STRING_ADVANCE (c
, p1
);
5257 EMIT_ONE_BYTE (*p0
);
5264 while (charbuf
< charbuf_end
)
5266 ASSURE_DESTINATION (safe_room
);
5273 if (coding
->src_multibyte
)
5275 int safe_room
= MAX_MULTIBYTE_LENGTH
;
5277 while (charbuf
< charbuf_end
)
5279 ASSURE_DESTINATION (safe_room
);
5281 if (ASCII_CHAR_P (c
))
5283 else if (CHAR_BYTE8_P (c
))
5284 *dst
++ = CHAR_TO_BYTE8 (c
);
5286 CHAR_STRING_ADVANCE (c
, dst
);
5291 ASSURE_DESTINATION (charbuf_end
- charbuf
);
5292 while (charbuf
< charbuf_end
&& dst
< dst_end
)
5293 *dst
++ = *charbuf
++;
5295 produced_chars
= dst
- (coding
->destination
+ coding
->produced
);
5297 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5298 coding
->produced_char
+= produced_chars
;
5299 coding
->produced
= dst
- coding
->destination
;
5303 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5304 Return true if a text is encoded in a charset-based coding system. */
5307 detect_coding_charset (struct coding_system
*coding
,
5308 struct coding_detection_info
*detect_info
)
5310 const unsigned char *src
= coding
->source
, *src_base
;
5311 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5312 bool multibytep
= coding
->src_multibyte
;
5313 ptrdiff_t consumed_chars
= 0;
5314 Lisp_Object attrs
, valids
, name
;
5316 ptrdiff_t head_ascii
= coding
->head_ascii
;
5317 bool check_latin_extra
= 0;
5319 detect_info
->checked
|= CATEGORY_MASK_CHARSET
;
5321 coding
= &coding_categories
[coding_category_charset
];
5322 attrs
= CODING_ID_ATTRS (coding
->id
);
5323 valids
= AREF (attrs
, coding_attr_charset_valids
);
5324 name
= CODING_ID_NAME (coding
->id
);
5325 if (strncmp (SSDATA (SYMBOL_NAME (name
)),
5326 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5327 || strncmp (SSDATA (SYMBOL_NAME (name
)),
5328 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5329 check_latin_extra
= 1;
5331 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
5338 struct charset
*charset
;
5345 val
= AREF (valids
, c
);
5351 && check_latin_extra
5352 && (!VECTORP (Vlatin_extra_code_table
)
5353 || NILP (AREF (Vlatin_extra_code_table
, c
))))
5355 found
= CATEGORY_MASK_CHARSET
;
5359 charset
= CHARSET_FROM_ID (XFASTINT (val
));
5360 dim
= CHARSET_DIMENSION (charset
);
5361 for (idx
= 1; idx
< dim
; idx
++)
5366 if (c
< charset
->code_space
[(dim
- 1 - idx
) * 4]
5367 || c
> charset
->code_space
[(dim
- 1 - idx
) * 4 + 1])
5376 for (; CONSP (val
); val
= XCDR (val
))
5378 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
5379 dim
= CHARSET_DIMENSION (charset
);
5385 if (c
< charset
->code_space
[(dim
- 1 - idx
) * 4]
5386 || c
> charset
->code_space
[(dim
- 1 - idx
) * 4 + 1])
5401 detect_info
->rejected
|= CATEGORY_MASK_CHARSET
;
5405 detect_info
->found
|= found
;
5410 decode_coding_charset (struct coding_system
*coding
)
5412 const unsigned char *src
= coding
->source
+ coding
->consumed
;
5413 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5414 const unsigned char *src_base
;
5415 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
5416 /* We may produce one charset annotation in one loop and one more at
5419 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 2);
5420 ptrdiff_t consumed_chars
= 0, consumed_chars_base
;
5421 bool multibytep
= coding
->src_multibyte
;
5422 Lisp_Object attrs
= CODING_ID_ATTRS (coding
->id
);
5424 ptrdiff_t char_offset
= coding
->produced_char
;
5425 ptrdiff_t last_offset
= char_offset
;
5426 int last_id
= charset_ascii
;
5428 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
5429 int byte_after_cr
= -1;
5431 valids
= AREF (attrs
, coding_attr_charset_valids
);
5437 struct charset
*charset
;
5443 consumed_chars_base
= consumed_chars
;
5445 if (charbuf
>= charbuf_end
)
5447 if (byte_after_cr
>= 0)
5452 if (byte_after_cr
>= 0)
5460 if (eol_dos
&& c
== '\r')
5461 ONE_MORE_BYTE (byte_after_cr
);
5467 val
= AREF (valids
, c
);
5468 if (! INTEGERP (val
) && ! CONSP (val
))
5472 charset
= CHARSET_FROM_ID (XFASTINT (val
));
5473 dim
= CHARSET_DIMENSION (charset
);
5477 code
= (code
<< 8) | c
;
5480 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
5485 /* VAL is a list of charset IDs. It is assured that the
5486 list is sorted by charset dimensions (smaller one
5490 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
5491 dim
= CHARSET_DIMENSION (charset
);
5495 code
= (code
<< 8) | c
;
5498 CODING_DECODE_CHAR (coding
, src
, src_base
,
5499 src_end
, charset
, code
, c
);
5507 if (charset
->id
!= charset_ascii
5508 && last_id
!= charset
->id
)
5510 if (last_id
!= charset_ascii
)
5511 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
5512 last_id
= charset
->id
;
5513 last_offset
= char_offset
;
5522 consumed_chars
= consumed_chars_base
;
5524 *charbuf
++ = c
< 0 ? -c
: ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
5530 if (last_id
!= charset_ascii
)
5531 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
5532 coding
->consumed_char
+= consumed_chars_base
;
5533 coding
->consumed
= src_base
- coding
->source
;
5534 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
5538 encode_coding_charset (struct coding_system
*coding
)
5540 bool multibytep
= coding
->dst_multibyte
;
5541 int *charbuf
= coding
->charbuf
;
5542 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5543 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5544 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5545 int safe_room
= MAX_MULTIBYTE_LENGTH
;
5546 ptrdiff_t produced_chars
= 0;
5547 Lisp_Object attrs
, charset_list
;
5548 bool ascii_compatible
;
5551 CODING_GET_INFO (coding
, attrs
, charset_list
);
5552 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
5554 while (charbuf
< charbuf_end
)
5556 struct charset
*charset
;
5559 ASSURE_DESTINATION (safe_room
);
5561 if (ascii_compatible
&& ASCII_CHAR_P (c
))
5562 EMIT_ONE_ASCII_BYTE (c
);
5563 else if (CHAR_BYTE8_P (c
))
5565 c
= CHAR_TO_BYTE8 (c
);
5570 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
5575 if (CHARSET_DIMENSION (charset
) == 1)
5576 EMIT_ONE_BYTE (code
);
5577 else if (CHARSET_DIMENSION (charset
) == 2)
5578 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
5579 else if (CHARSET_DIMENSION (charset
) == 3)
5580 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
5582 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
5583 (code
>> 8) & 0xFF, code
& 0xFF);
5587 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
5588 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
5590 c
= coding
->default_char
;
5596 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5597 coding
->produced_char
+= produced_chars
;
5598 coding
->produced
= dst
- coding
->destination
;
5603 /*** 7. C library functions ***/
5605 /* Setup coding context CODING from information about CODING_SYSTEM.
5606 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5607 CODING_SYSTEM is invalid, signal an error. */
5610 setup_coding_system (Lisp_Object coding_system
, struct coding_system
*coding
)
5613 Lisp_Object eol_type
;
5614 Lisp_Object coding_type
;
5617 if (NILP (coding_system
))
5618 coding_system
= Qundecided
;
5620 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
5622 attrs
= CODING_ID_ATTRS (coding
->id
);
5623 eol_type
= inhibit_eol_conversion
? Qunix
: CODING_ID_EOL_TYPE (coding
->id
);
5626 coding
->head_ascii
= -1;
5627 if (VECTORP (eol_type
))
5628 coding
->common_flags
= (CODING_REQUIRE_DECODING_MASK
5629 | CODING_REQUIRE_DETECTION_MASK
);
5630 else if (! EQ (eol_type
, Qunix
))
5631 coding
->common_flags
= (CODING_REQUIRE_DECODING_MASK
5632 | CODING_REQUIRE_ENCODING_MASK
);
5634 coding
->common_flags
= 0;
5635 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
5636 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
5637 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
5638 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
5639 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs
)))
5640 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
5642 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
5643 coding
->max_charset_id
= SCHARS (val
) - 1;
5644 coding
->safe_charsets
= SDATA (val
);
5645 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
5646 coding
->carryover_bytes
= 0;
5648 coding_type
= CODING_ATTR_TYPE (attrs
);
5649 if (EQ (coding_type
, Qundecided
))
5651 coding
->detector
= NULL
;
5652 coding
->decoder
= decode_coding_raw_text
;
5653 coding
->encoder
= encode_coding_raw_text
;
5654 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
5656 else if (EQ (coding_type
, Qiso_2022
))
5659 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
5661 /* Invoke graphic register 0 to plane 0. */
5662 CODING_ISO_INVOCATION (coding
, 0) = 0;
5663 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5664 CODING_ISO_INVOCATION (coding
, 1)
5665 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
5666 /* Setup the initial status of designation. */
5667 for (i
= 0; i
< 4; i
++)
5668 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
5669 /* Not single shifting initially. */
5670 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
5671 /* Beginning of buffer should also be regarded as bol. */
5672 CODING_ISO_BOL (coding
) = 1;
5673 coding
->detector
= detect_coding_iso_2022
;
5674 coding
->decoder
= decode_coding_iso_2022
;
5675 coding
->encoder
= encode_coding_iso_2022
;
5676 if (flags
& CODING_ISO_FLAG_SAFE
)
5677 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
5678 coding
->common_flags
5679 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
5680 | CODING_REQUIRE_FLUSHING_MASK
);
5681 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
5682 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
5683 if (flags
& CODING_ISO_FLAG_DESIGNATION
)
5684 coding
->common_flags
|= CODING_ANNOTATE_CHARSET_MASK
;
5685 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
5687 setup_iso_safe_charsets (attrs
);
5688 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
5689 coding
->max_charset_id
= SCHARS (val
) - 1;
5690 coding
->safe_charsets
= SDATA (val
);
5692 CODING_ISO_FLAGS (coding
) = flags
;
5693 CODING_ISO_CMP_STATUS (coding
)->state
= COMPOSING_NO
;
5694 CODING_ISO_CMP_STATUS (coding
)->method
= COMPOSITION_NO
;
5695 CODING_ISO_EXTSEGMENT_LEN (coding
) = 0;
5696 CODING_ISO_EMBEDDED_UTF_8 (coding
) = 0;
5698 else if (EQ (coding_type
, Qcharset
))
5700 coding
->detector
= detect_coding_charset
;
5701 coding
->decoder
= decode_coding_charset
;
5702 coding
->encoder
= encode_coding_charset
;
5703 coding
->common_flags
5704 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5706 else if (EQ (coding_type
, Qutf_8
))
5708 val
= AREF (attrs
, coding_attr_utf_bom
);
5709 CODING_UTF_8_BOM (coding
) = (CONSP (val
) ? utf_detect_bom
5710 : EQ (val
, Qt
) ? utf_with_bom
5712 coding
->detector
= detect_coding_utf_8
;
5713 coding
->decoder
= decode_coding_utf_8
;
5714 coding
->encoder
= encode_coding_utf_8
;
5715 coding
->common_flags
5716 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5717 if (CODING_UTF_8_BOM (coding
) == utf_detect_bom
)
5718 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
5720 else if (EQ (coding_type
, Qutf_16
))
5722 val
= AREF (attrs
, coding_attr_utf_bom
);
5723 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_detect_bom
5724 : EQ (val
, Qt
) ? utf_with_bom
5726 val
= AREF (attrs
, coding_attr_utf_16_endian
);
5727 CODING_UTF_16_ENDIAN (coding
) = (EQ (val
, Qbig
) ? utf_16_big_endian
5728 : utf_16_little_endian
);
5729 CODING_UTF_16_SURROGATE (coding
) = 0;
5730 coding
->detector
= detect_coding_utf_16
;
5731 coding
->decoder
= decode_coding_utf_16
;
5732 coding
->encoder
= encode_coding_utf_16
;
5733 coding
->common_flags
5734 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5735 if (CODING_UTF_16_BOM (coding
) == utf_detect_bom
)
5736 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
5738 else if (EQ (coding_type
, Qccl
))
5740 coding
->detector
= detect_coding_ccl
;
5741 coding
->decoder
= decode_coding_ccl
;
5742 coding
->encoder
= encode_coding_ccl
;
5743 coding
->common_flags
5744 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
5745 | CODING_REQUIRE_FLUSHING_MASK
);
5747 else if (EQ (coding_type
, Qemacs_mule
))
5749 coding
->detector
= detect_coding_emacs_mule
;
5750 coding
->decoder
= decode_coding_emacs_mule
;
5751 coding
->encoder
= encode_coding_emacs_mule
;
5752 coding
->common_flags
5753 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5754 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
5755 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
5757 Lisp_Object tail
, safe_charsets
;
5758 int max_charset_id
= 0;
5760 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
5762 if (max_charset_id
< XFASTINT (XCAR (tail
)))
5763 max_charset_id
= XFASTINT (XCAR (tail
));
5764 safe_charsets
= make_uninit_string (max_charset_id
+ 1);
5765 memset (SDATA (safe_charsets
), 255, max_charset_id
+ 1);
5766 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
5768 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
5769 coding
->max_charset_id
= max_charset_id
;
5770 coding
->safe_charsets
= SDATA (safe_charsets
);
5772 coding
->spec
.emacs_mule
.cmp_status
.state
= COMPOSING_NO
;
5773 coding
->spec
.emacs_mule
.cmp_status
.method
= COMPOSITION_NO
;
5775 else if (EQ (coding_type
, Qshift_jis
))
5777 coding
->detector
= detect_coding_sjis
;
5778 coding
->decoder
= decode_coding_sjis
;
5779 coding
->encoder
= encode_coding_sjis
;
5780 coding
->common_flags
5781 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5783 else if (EQ (coding_type
, Qbig5
))
5785 coding
->detector
= detect_coding_big5
;
5786 coding
->decoder
= decode_coding_big5
;
5787 coding
->encoder
= encode_coding_big5
;
5788 coding
->common_flags
5789 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5791 else /* EQ (coding_type, Qraw_text) */
5793 coding
->detector
= NULL
;
5794 coding
->decoder
= decode_coding_raw_text
;
5795 coding
->encoder
= encode_coding_raw_text
;
5796 if (! EQ (eol_type
, Qunix
))
5798 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
5799 if (! VECTORP (eol_type
))
5800 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
5808 /* Return a list of charsets supported by CODING. */
5811 coding_charset_list (struct coding_system
*coding
)
5813 Lisp_Object attrs
, charset_list
;
5815 CODING_GET_INFO (coding
, attrs
, charset_list
);
5816 if (EQ (CODING_ATTR_TYPE (attrs
), Qiso_2022
))
5818 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
5820 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
5821 charset_list
= Viso_2022_charset_list
;
5823 else if (EQ (CODING_ATTR_TYPE (attrs
), Qemacs_mule
))
5825 charset_list
= Vemacs_mule_charset_list
;
5827 return charset_list
;
5831 /* Return a list of charsets supported by CODING-SYSTEM. */
5834 coding_system_charset_list (Lisp_Object coding_system
)
5837 Lisp_Object attrs
, charset_list
;
5839 CHECK_CODING_SYSTEM_GET_ID (coding_system
, id
);
5840 attrs
= CODING_ID_ATTRS (id
);
5842 if (EQ (CODING_ATTR_TYPE (attrs
), Qiso_2022
))
5844 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
5846 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
5847 charset_list
= Viso_2022_charset_list
;
5849 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
5851 else if (EQ (CODING_ATTR_TYPE (attrs
), Qemacs_mule
))
5853 charset_list
= Vemacs_mule_charset_list
;
5857 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
5859 return charset_list
;
5863 /* Return raw-text or one of its subsidiaries that has the same
5864 eol_type as CODING-SYSTEM. */
5867 raw_text_coding_system (Lisp_Object coding_system
)
5869 Lisp_Object spec
, attrs
;
5870 Lisp_Object eol_type
, raw_text_eol_type
;
5872 if (NILP (coding_system
))
5874 spec
= CODING_SYSTEM_SPEC (coding_system
);
5875 attrs
= AREF (spec
, 0);
5877 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
5878 return coding_system
;
5880 eol_type
= AREF (spec
, 2);
5881 if (VECTORP (eol_type
))
5883 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
5884 raw_text_eol_type
= AREF (spec
, 2);
5885 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
5886 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
5887 : AREF (raw_text_eol_type
, 2));
5891 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5892 the subsidiary that has the same eol-spec as PARENT (if it is not
5893 nil and specifies end-of-line format) or the system's setting
5894 (system_eol_type). */
5897 coding_inherit_eol_type (Lisp_Object coding_system
, Lisp_Object parent
)
5899 Lisp_Object spec
, eol_type
;
5901 if (NILP (coding_system
))
5902 coding_system
= Qraw_text
;
5903 spec
= CODING_SYSTEM_SPEC (coding_system
);
5904 eol_type
= AREF (spec
, 2);
5905 if (VECTORP (eol_type
))
5907 Lisp_Object parent_eol_type
;
5909 if (! NILP (parent
))
5911 Lisp_Object parent_spec
;
5913 parent_spec
= CODING_SYSTEM_SPEC (parent
);
5914 parent_eol_type
= AREF (parent_spec
, 2);
5915 if (VECTORP (parent_eol_type
))
5916 parent_eol_type
= system_eol_type
;
5919 parent_eol_type
= system_eol_type
;
5920 if (EQ (parent_eol_type
, Qunix
))
5921 coding_system
= AREF (eol_type
, 0);
5922 else if (EQ (parent_eol_type
, Qdos
))
5923 coding_system
= AREF (eol_type
, 1);
5924 else if (EQ (parent_eol_type
, Qmac
))
5925 coding_system
= AREF (eol_type
, 2);
5927 return coding_system
;
5931 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5932 decided for writing to a process. If not, complement them, and
5933 return a new coding system. */
5936 complement_process_encoding_system (Lisp_Object coding_system
)
5938 Lisp_Object coding_base
= Qnil
, eol_base
= Qnil
;
5939 Lisp_Object spec
, attrs
;
5942 for (i
= 0; i
< 3; i
++)
5945 coding_system
= CDR_SAFE (Vdefault_process_coding_system
);
5947 coding_system
= preferred_coding_system ();
5948 spec
= CODING_SYSTEM_SPEC (coding_system
);
5951 attrs
= AREF (spec
, 0);
5952 if (NILP (coding_base
) && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
5953 coding_base
= CODING_ATTR_BASE_NAME (attrs
);
5954 if (NILP (eol_base
) && ! VECTORP (AREF (spec
, 2)))
5955 eol_base
= coding_system
;
5956 if (! NILP (coding_base
) && ! NILP (eol_base
))
5961 /* The original CODING_SYSTEM didn't specify text-conversion or
5962 eol-conversion. Be sure that we return a fully complemented
5964 coding_system
= coding_inherit_eol_type (coding_base
, eol_base
);
5965 return coding_system
;
5969 /* Emacs has a mechanism to automatically detect a coding system if it
5970 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5971 it's impossible to distinguish some coding systems accurately
5972 because they use the same range of codes. So, at first, coding
5973 systems are categorized into 7, those are:
5975 o coding-category-emacs-mule
5977 The category for a coding system which has the same code range
5978 as Emacs' internal format. Assigned the coding-system (Lisp
5979 symbol) `emacs-mule' by default.
5981 o coding-category-sjis
5983 The category for a coding system which has the same code range
5984 as SJIS. Assigned the coding-system (Lisp
5985 symbol) `japanese-shift-jis' by default.
5987 o coding-category-iso-7
5989 The category for a coding system which has the same code range
5990 as ISO2022 of 7-bit environment. This doesn't use any locking
5991 shift and single shift functions. This can encode/decode all
5992 charsets. Assigned the coding-system (Lisp symbol)
5993 `iso-2022-7bit' by default.
5995 o coding-category-iso-7-tight
5997 Same as coding-category-iso-7 except that this can
5998 encode/decode only the specified charsets.
6000 o coding-category-iso-8-1
6002 The category for a coding system which has the same code range
6003 as ISO2022 of 8-bit environment and graphic plane 1 used only
6004 for DIMENSION1 charset. This doesn't use any locking shift
6005 and single shift functions. Assigned the coding-system (Lisp
6006 symbol) `iso-latin-1' by default.
6008 o coding-category-iso-8-2
6010 The category for a coding system which has the same code range
6011 as ISO2022 of 8-bit environment and graphic plane 1 used only
6012 for DIMENSION2 charset. This doesn't use any locking shift
6013 and single shift functions. Assigned the coding-system (Lisp
6014 symbol) `japanese-iso-8bit' by default.
6016 o coding-category-iso-7-else
6018 The category for a coding system which has the same code range
6019 as ISO2022 of 7-bit environment but uses locking shift or
6020 single shift functions. Assigned the coding-system (Lisp
6021 symbol) `iso-2022-7bit-lock' by default.
6023 o coding-category-iso-8-else
6025 The category for a coding system which has the same code range
6026 as ISO2022 of 8-bit environment but uses locking shift or
6027 single shift functions. Assigned the coding-system (Lisp
6028 symbol) `iso-2022-8bit-ss2' by default.
6030 o coding-category-big5
6032 The category for a coding system which has the same code range
6033 as BIG5. Assigned the coding-system (Lisp symbol)
6034 `cn-big5' by default.
6036 o coding-category-utf-8
6038 The category for a coding system which has the same code range
6039 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6040 symbol) `utf-8' by default.
6042 o coding-category-utf-16-be
6044 The category for a coding system in which a text has an
6045 Unicode signature (cf. Unicode Standard) in the order of BIG
6046 endian at the head. Assigned the coding-system (Lisp symbol)
6047 `utf-16-be' by default.
6049 o coding-category-utf-16-le
6051 The category for a coding system in which a text has an
6052 Unicode signature (cf. Unicode Standard) in the order of
6053 LITTLE endian at the head. Assigned the coding-system (Lisp
6054 symbol) `utf-16-le' by default.
6056 o coding-category-ccl
6058 The category for a coding system of which encoder/decoder is
6059 written in CCL programs. The default value is nil, i.e., no
6060 coding system is assigned.
6062 o coding-category-binary
6064 The category for a coding system not categorized in any of the
6065 above. Assigned the coding-system (Lisp symbol)
6066 `no-conversion' by default.
6068 Each of them is a Lisp symbol and the value is an actual
6069 `coding-system's (this is also a Lisp symbol) assigned by a user.
6070 What Emacs does actually is to detect a category of coding system.
6071 Then, it uses a `coding-system' assigned to it. If Emacs can't
6072 decide only one possible category, it selects a category of the
6073 highest priority. Priorities of categories are also specified by a
6074 user in a Lisp variable `coding-category-list'.
6078 #define EOL_SEEN_NONE 0
6079 #define EOL_SEEN_LF 1
6080 #define EOL_SEEN_CR 2
6081 #define EOL_SEEN_CRLF 4
6083 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6084 SOURCE is encoded. If CATEGORY is one of
6085 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6086 two-byte, else they are encoded by one-byte.
6088 Return one of EOL_SEEN_XXX. */
6090 #define MAX_EOL_CHECK_COUNT 3
6093 detect_eol (const unsigned char *source
, ptrdiff_t src_bytes
,
6094 enum coding_category category
)
6096 const unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
6099 int eol_seen
= EOL_SEEN_NONE
;
6101 if ((1 << category
) & CATEGORY_MASK_UTF_16
)
6103 bool msb
= category
== (coding_category_utf_16_le
6104 | coding_category_utf_16_le_nosig
);
6107 while (src
+ 1 < src_end
)
6110 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
6115 this_eol
= EOL_SEEN_LF
;
6116 else if (src
+ 3 >= src_end
6117 || src
[msb
+ 2] != 0
6118 || src
[lsb
+ 2] != '\n')
6119 this_eol
= EOL_SEEN_CR
;
6122 this_eol
= EOL_SEEN_CRLF
;
6126 if (eol_seen
== EOL_SEEN_NONE
)
6127 /* This is the first end-of-line. */
6128 eol_seen
= this_eol
;
6129 else if (eol_seen
!= this_eol
)
6131 /* The found type is different from what found before.
6132 Allow for stray ^M characters in DOS EOL files. */
6133 if ((eol_seen
== EOL_SEEN_CR
&& this_eol
== EOL_SEEN_CRLF
)
6134 || (eol_seen
== EOL_SEEN_CRLF
6135 && this_eol
== EOL_SEEN_CR
))
6136 eol_seen
= EOL_SEEN_CRLF
;
6139 eol_seen
= EOL_SEEN_LF
;
6143 if (++total
== MAX_EOL_CHECK_COUNT
)
6150 while (src
< src_end
)
6153 if (c
== '\n' || c
== '\r')
6158 this_eol
= EOL_SEEN_LF
;
6159 else if (src
>= src_end
|| *src
!= '\n')
6160 this_eol
= EOL_SEEN_CR
;
6162 this_eol
= EOL_SEEN_CRLF
, src
++;
6164 if (eol_seen
== EOL_SEEN_NONE
)
6165 /* This is the first end-of-line. */
6166 eol_seen
= this_eol
;
6167 else if (eol_seen
!= this_eol
)
6169 /* The found type is different from what found before.
6170 Allow for stray ^M characters in DOS EOL files. */
6171 if ((eol_seen
== EOL_SEEN_CR
&& this_eol
== EOL_SEEN_CRLF
)
6172 || (eol_seen
== EOL_SEEN_CRLF
&& this_eol
== EOL_SEEN_CR
))
6173 eol_seen
= EOL_SEEN_CRLF
;
6176 eol_seen
= EOL_SEEN_LF
;
6180 if (++total
== MAX_EOL_CHECK_COUNT
)
6189 adjust_coding_eol_type (struct coding_system
*coding
, int eol_seen
)
6191 Lisp_Object eol_type
;
6193 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6194 if (eol_seen
& EOL_SEEN_LF
)
6196 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
6199 else if (eol_seen
& EOL_SEEN_CRLF
)
6201 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
6204 else if (eol_seen
& EOL_SEEN_CR
)
6206 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
6212 /* Detect how a text specified in CODING is encoded. If a coding
6213 system is detected, update fields of CODING by the detected coding
6217 detect_coding (struct coding_system
*coding
)
6219 const unsigned char *src
, *src_end
;
6220 unsigned int saved_mode
= coding
->mode
;
6222 coding
->consumed
= coding
->consumed_char
= 0;
6223 coding
->produced
= coding
->produced_char
= 0;
6224 coding_set_source (coding
);
6226 src_end
= coding
->source
+ coding
->src_bytes
;
6227 coding
->head_ascii
= 0;
6229 /* If we have not yet decided the text encoding type, detect it
6231 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
6234 struct coding_detection_info detect_info
;
6235 bool null_byte_found
= 0, eight_bit_found
= 0;
6237 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
6238 for (src
= coding
->source
; src
< src_end
; src
++)
6243 eight_bit_found
= 1;
6244 if (null_byte_found
)
6249 if ((c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
6250 && ! inhibit_iso_escape_detection
6251 && ! detect_info
.checked
)
6253 if (detect_coding_iso_2022 (coding
, &detect_info
))
6255 /* We have scanned the whole data. */
6256 if (! (detect_info
.rejected
& CATEGORY_MASK_ISO_7_ELSE
))
6258 /* We didn't find an 8-bit code. We may
6259 have found a null-byte, but it's very
6260 rare that a binary file conforms to
6263 coding
->head_ascii
= src
- coding
->source
;
6265 detect_info
.rejected
|= ~CATEGORY_MASK_ISO_ESCAPE
;
6269 else if (! c
&& !inhibit_null_byte_detection
)
6271 null_byte_found
= 1;
6272 if (eight_bit_found
)
6275 if (! eight_bit_found
)
6276 coding
->head_ascii
++;
6278 else if (! eight_bit_found
)
6279 coding
->head_ascii
++;
6282 if (null_byte_found
|| eight_bit_found
6283 || coding
->head_ascii
< coding
->src_bytes
6284 || detect_info
.found
)
6286 enum coding_category category
;
6287 struct coding_system
*this;
6289 if (coding
->head_ascii
== coding
->src_bytes
)
6290 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6291 for (i
= 0; i
< coding_category_raw_text
; i
++)
6293 category
= coding_priorities
[i
];
6294 this = coding_categories
+ category
;
6295 if (detect_info
.found
& (1 << category
))
6300 if (null_byte_found
)
6302 detect_info
.checked
|= ~CATEGORY_MASK_UTF_16
;
6303 detect_info
.rejected
|= ~CATEGORY_MASK_UTF_16
;
6305 for (i
= 0; i
< coding_category_raw_text
; i
++)
6307 category
= coding_priorities
[i
];
6308 this = coding_categories
+ category
;
6309 /* Some of this->detector (e.g. detect_coding_sjis)
6310 require this information. */
6311 coding
->id
= this->id
;
6314 /* No coding system of this category is defined. */
6315 detect_info
.rejected
|= (1 << category
);
6317 else if (category
>= coding_category_raw_text
)
6319 else if (detect_info
.checked
& (1 << category
))
6321 if (detect_info
.found
& (1 << category
))
6324 else if ((*(this->detector
)) (coding
, &detect_info
)
6325 && detect_info
.found
& (1 << category
))
6327 if (category
== coding_category_utf_16_auto
)
6329 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
6330 category
= coding_category_utf_16_le
;
6332 category
= coding_category_utf_16_be
;
6339 if (i
< coding_category_raw_text
)
6340 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
6341 else if (null_byte_found
)
6342 setup_coding_system (Qno_conversion
, coding
);
6343 else if ((detect_info
.rejected
& CATEGORY_MASK_ANY
)
6344 == CATEGORY_MASK_ANY
)
6345 setup_coding_system (Qraw_text
, coding
);
6346 else if (detect_info
.rejected
)
6347 for (i
= 0; i
< coding_category_raw_text
; i
++)
6348 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
6350 this = coding_categories
+ coding_priorities
[i
];
6351 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
6356 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding
->id
)))
6357 == coding_category_utf_8_auto
)
6359 Lisp_Object coding_systems
;
6360 struct coding_detection_info detect_info
;
6363 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_bom
);
6364 detect_info
.found
= detect_info
.rejected
= 0;
6365 coding
->head_ascii
= 0;
6366 if (CONSP (coding_systems
)
6367 && detect_coding_utf_8 (coding
, &detect_info
))
6369 if (detect_info
.found
& CATEGORY_MASK_UTF_8_SIG
)
6370 setup_coding_system (XCAR (coding_systems
), coding
);
6372 setup_coding_system (XCDR (coding_systems
), coding
);
6375 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding
->id
)))
6376 == coding_category_utf_16_auto
)
6378 Lisp_Object coding_systems
;
6379 struct coding_detection_info detect_info
;
6382 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_bom
);
6383 detect_info
.found
= detect_info
.rejected
= 0;
6384 coding
->head_ascii
= 0;
6385 if (CONSP (coding_systems
)
6386 && detect_coding_utf_16 (coding
, &detect_info
))
6388 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
6389 setup_coding_system (XCAR (coding_systems
), coding
);
6390 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
6391 setup_coding_system (XCDR (coding_systems
), coding
);
6394 coding
->mode
= saved_mode
;
6399 decode_eol (struct coding_system
*coding
)
6401 Lisp_Object eol_type
;
6402 unsigned char *p
, *pbeg
, *pend
;
6404 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6405 if (EQ (eol_type
, Qunix
) || inhibit_eol_conversion
)
6408 if (NILP (coding
->dst_object
))
6409 pbeg
= coding
->destination
;
6411 pbeg
= BYTE_POS_ADDR (coding
->dst_pos_byte
);
6412 pend
= pbeg
+ coding
->produced
;
6414 if (VECTORP (eol_type
))
6416 int eol_seen
= EOL_SEEN_NONE
;
6418 for (p
= pbeg
; p
< pend
; p
++)
6421 eol_seen
|= EOL_SEEN_LF
;
6422 else if (*p
== '\r')
6424 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
6426 eol_seen
|= EOL_SEEN_CRLF
;
6430 eol_seen
|= EOL_SEEN_CR
;
6433 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6434 if ((eol_seen
& EOL_SEEN_CRLF
) != 0
6435 && (eol_seen
& EOL_SEEN_CR
) != 0
6436 && (eol_seen
& EOL_SEEN_LF
) == 0)
6437 eol_seen
= EOL_SEEN_CRLF
;
6438 else if (eol_seen
!= EOL_SEEN_NONE
6439 && eol_seen
!= EOL_SEEN_LF
6440 && eol_seen
!= EOL_SEEN_CRLF
6441 && eol_seen
!= EOL_SEEN_CR
)
6442 eol_seen
= EOL_SEEN_LF
;
6443 if (eol_seen
!= EOL_SEEN_NONE
)
6444 eol_type
= adjust_coding_eol_type (coding
, eol_seen
);
6447 if (EQ (eol_type
, Qmac
))
6449 for (p
= pbeg
; p
< pend
; p
++)
6453 else if (EQ (eol_type
, Qdos
))
6457 if (NILP (coding
->dst_object
))
6459 /* Start deleting '\r' from the tail to minimize the memory
6461 for (p
= pend
- 2; p
>= pbeg
; p
--)
6464 memmove (p
, p
+ 1, pend
-- - p
- 1);
6470 ptrdiff_t pos_byte
= coding
->dst_pos_byte
;
6471 ptrdiff_t pos
= coding
->dst_pos
;
6472 ptrdiff_t pos_end
= pos
+ coding
->produced_char
- 1;
6474 while (pos
< pos_end
)
6476 p
= BYTE_POS_ADDR (pos_byte
);
6477 if (*p
== '\r' && p
[1] == '\n')
6479 del_range_2 (pos
, pos_byte
, pos
+ 1, pos_byte
+ 1, 0);
6484 if (coding
->dst_multibyte
)
6485 pos_byte
+= BYTES_BY_CHAR_HEAD (*p
);
6490 coding
->produced
-= n
;
6491 coding
->produced_char
-= n
;
6496 /* Return a translation table (or list of them) from coding system
6497 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6501 get_translation_table (Lisp_Object attrs
, bool encodep
, int *max_lookup
)
6503 Lisp_Object standard
, translation_table
;
6506 if (NILP (Venable_character_translation
))
6513 translation_table
= CODING_ATTR_ENCODE_TBL (attrs
),
6514 standard
= Vstandard_translation_table_for_encode
;
6516 translation_table
= CODING_ATTR_DECODE_TBL (attrs
),
6517 standard
= Vstandard_translation_table_for_decode
;
6518 if (NILP (translation_table
))
6519 translation_table
= standard
;
6522 if (SYMBOLP (translation_table
))
6523 translation_table
= Fget (translation_table
, Qtranslation_table
);
6524 else if (CONSP (translation_table
))
6526 translation_table
= Fcopy_sequence (translation_table
);
6527 for (val
= translation_table
; CONSP (val
); val
= XCDR (val
))
6528 if (SYMBOLP (XCAR (val
)))
6529 XSETCAR (val
, Fget (XCAR (val
), Qtranslation_table
));
6531 if (CHAR_TABLE_P (standard
))
6533 if (CONSP (translation_table
))
6534 translation_table
= nconc2 (translation_table
,
6535 Fcons (standard
, Qnil
));
6537 translation_table
= Fcons (translation_table
,
6538 Fcons (standard
, Qnil
));
6545 if (CHAR_TABLE_P (translation_table
)
6546 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table
)) > 1)
6548 val
= XCHAR_TABLE (translation_table
)->extras
[1];
6549 if (NATNUMP (val
) && *max_lookup
< XFASTINT (val
))
6550 *max_lookup
= XFASTINT (val
);
6552 else if (CONSP (translation_table
))
6556 for (tail
= translation_table
; CONSP (tail
); tail
= XCDR (tail
))
6557 if (CHAR_TABLE_P (XCAR (tail
))
6558 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail
))) > 1)
6560 Lisp_Object tailval
= XCHAR_TABLE (XCAR (tail
))->extras
[1];
6561 if (NATNUMP (tailval
) && *max_lookup
< XFASTINT (tailval
))
6562 *max_lookup
= XFASTINT (tailval
);
6566 return translation_table
;
6569 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6572 if (CHAR_TABLE_P (table)) \
6574 trans = CHAR_TABLE_REF (table, c); \
6575 if (CHARACTERP (trans)) \
6576 c = XFASTINT (trans), trans = Qnil; \
6578 else if (CONSP (table)) \
6582 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6583 if (CHAR_TABLE_P (XCAR (tail))) \
6585 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6586 if (CHARACTERP (trans)) \
6587 c = XFASTINT (trans), trans = Qnil; \
6588 else if (! NILP (trans)) \
6595 /* Return a translation of character(s) at BUF according to TRANS.
6596 TRANS is TO-CHAR or ((FROM . TO) ...) where
6597 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6598 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6599 translation is found, and Qnil if not found..
6600 If BUF is too short to lookup characters in FROM, return Qt. */
6603 get_translation (Lisp_Object trans
, int *buf
, int *buf_end
)
6606 if (INTEGERP (trans
))
6608 for (; CONSP (trans
); trans
= XCDR (trans
))
6610 Lisp_Object val
= XCAR (trans
);
6611 Lisp_Object from
= XCAR (val
);
6612 ptrdiff_t len
= ASIZE (from
);
6615 for (i
= 0; i
< len
; i
++)
6617 if (buf
+ i
== buf_end
)
6619 if (XINT (AREF (from
, i
)) != buf
[i
])
6630 produce_chars (struct coding_system
*coding
, Lisp_Object translation_table
,
6633 unsigned char *dst
= coding
->destination
+ coding
->produced
;
6634 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
6636 ptrdiff_t produced_chars
= 0;
6639 if (! coding
->chars_at_source
)
6641 /* Source characters are in coding->charbuf. */
6642 int *buf
= coding
->charbuf
;
6643 int *buf_end
= buf
+ coding
->charbuf_used
;
6645 if (EQ (coding
->src_object
, coding
->dst_object
))
6647 coding_set_source (coding
);
6648 dst_end
= ((unsigned char *) coding
->source
) + coding
->consumed
;
6651 while (buf
< buf_end
)
6658 ptrdiff_t from_nchars
= 1, to_nchars
= 1;
6659 Lisp_Object trans
= Qnil
;
6661 LOOKUP_TRANSLATION_TABLE (translation_table
, c
, trans
);
6664 trans
= get_translation (trans
, buf
, buf_end
);
6665 if (INTEGERP (trans
))
6667 else if (CONSP (trans
))
6669 from_nchars
= ASIZE (XCAR (trans
));
6670 trans
= XCDR (trans
);
6671 if (INTEGERP (trans
))
6675 to_nchars
= ASIZE (trans
);
6676 c
= XINT (AREF (trans
, 0));
6679 else if (EQ (trans
, Qt
) && ! last_block
)
6683 if ((dst_end
- dst
) / MAX_MULTIBYTE_LENGTH
< to_nchars
)
6685 if (((min (PTRDIFF_MAX
, SIZE_MAX
) - (buf_end
- buf
))
6686 / MAX_MULTIBYTE_LENGTH
)
6688 memory_full (SIZE_MAX
);
6689 dst
= alloc_destination (coding
,
6691 + MAX_MULTIBYTE_LENGTH
* to_nchars
,
6693 if (EQ (coding
->src_object
, coding
->dst_object
))
6695 coding_set_source (coding
);
6696 dst_end
= (((unsigned char *) coding
->source
)
6697 + coding
->consumed
);
6700 dst_end
= coding
->destination
+ coding
->dst_bytes
;
6703 for (i
= 0; i
< to_nchars
; i
++)
6706 c
= XINT (AREF (trans
, i
));
6707 if (coding
->dst_multibyte
6708 || ! CHAR_BYTE8_P (c
))
6709 CHAR_STRING_ADVANCE_NO_UNIFY (c
, dst
);
6711 *dst
++ = CHAR_TO_BYTE8 (c
);
6713 produced_chars
+= to_nchars
;
6717 /* This is an annotation datum. (-C) is the length. */
6720 carryover
= buf_end
- buf
;
6724 /* Source characters are at coding->source. */
6725 const unsigned char *src
= coding
->source
;
6726 const unsigned char *src_end
= src
+ coding
->consumed
;
6728 if (EQ (coding
->dst_object
, coding
->src_object
))
6729 dst_end
= (unsigned char *) src
;
6730 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
6732 if (coding
->src_multibyte
)
6734 bool multibytep
= 1;
6735 ptrdiff_t consumed_chars
= 0;
6739 const unsigned char *src_base
= src
;
6745 if (EQ (coding
->src_object
, coding
->dst_object
))
6746 dst_end
= (unsigned char *) src
;
6749 ptrdiff_t offset
= src
- coding
->source
;
6751 dst
= alloc_destination (coding
, src_end
- src
+ 1,
6753 dst_end
= coding
->destination
+ coding
->dst_bytes
;
6754 coding_set_source (coding
);
6755 src
= coding
->source
+ offset
;
6756 src_end
= coding
->source
+ coding
->consumed
;
6757 if (EQ (coding
->src_object
, coding
->dst_object
))
6758 dst_end
= (unsigned char *) src
;
6768 while (src
< src_end
)
6770 bool multibytep
= 1;
6773 if (dst
>= dst_end
- 1)
6775 if (EQ (coding
->src_object
, coding
->dst_object
))
6776 dst_end
= (unsigned char *) src
;
6777 if (dst
>= dst_end
- 1)
6779 ptrdiff_t offset
= src
- coding
->source
;
6780 ptrdiff_t more_bytes
;
6782 if (EQ (coding
->src_object
, coding
->dst_object
))
6783 more_bytes
= ((src_end
- src
) / 2) + 2;
6785 more_bytes
= src_end
- src
+ 2;
6786 dst
= alloc_destination (coding
, more_bytes
, dst
);
6787 dst_end
= coding
->destination
+ coding
->dst_bytes
;
6788 coding_set_source (coding
);
6789 src
= coding
->source
+ offset
;
6790 src_end
= coding
->source
+ coding
->consumed
;
6791 if (EQ (coding
->src_object
, coding
->dst_object
))
6792 dst_end
= (unsigned char *) src
;
6800 if (!EQ (coding
->src_object
, coding
->dst_object
))
6802 ptrdiff_t require
= coding
->src_bytes
- coding
->dst_bytes
;
6806 ptrdiff_t offset
= src
- coding
->source
;
6808 dst
= alloc_destination (coding
, require
, dst
);
6809 coding_set_source (coding
);
6810 src
= coding
->source
+ offset
;
6811 src_end
= coding
->source
+ coding
->consumed
;
6814 produced_chars
= coding
->consumed_char
;
6815 while (src
< src_end
)
6820 produced
= dst
- (coding
->destination
+ coding
->produced
);
6821 if (BUFFERP (coding
->dst_object
) && produced_chars
> 0)
6822 insert_from_gap (produced_chars
, produced
);
6823 coding
->produced
+= produced
;
6824 coding
->produced_char
+= produced_chars
;
6828 /* Compose text in CODING->object according to the annotation data at
6829 CHARBUF. CHARBUF is an array:
6830 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6834 produce_composition (struct coding_system
*coding
, int *charbuf
, ptrdiff_t pos
)
6838 enum composition_method method
;
6839 Lisp_Object components
;
6841 len
= -charbuf
[0] - MAX_ANNOTATION_LENGTH
;
6842 to
= pos
+ charbuf
[2];
6843 method
= (enum composition_method
) (charbuf
[4]);
6845 if (method
== COMPOSITION_RELATIVE
)
6849 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
6852 if (method
== COMPOSITION_WITH_RULE
)
6853 len
= charbuf
[2] * 3 - 2;
6854 charbuf
+= MAX_ANNOTATION_LENGTH
;
6855 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6856 for (i
= j
= 0; i
< len
&& charbuf
[i
] != -1; i
++, j
++)
6858 if (charbuf
[i
] >= 0)
6859 args
[j
] = make_number (charbuf
[i
]);
6863 args
[j
] = make_number (charbuf
[i
] % 0x100);
6866 components
= (i
== j
? Fstring (j
, args
) : Fvector (j
, args
));
6868 compose_text (pos
, to
, components
, Qnil
, coding
->dst_object
);
6872 /* Put `charset' property on text in CODING->object according to
6873 the annotation data at CHARBUF. CHARBUF is an array:
6874 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6878 produce_charset (struct coding_system
*coding
, int *charbuf
, ptrdiff_t pos
)
6880 ptrdiff_t from
= pos
- charbuf
[2];
6881 struct charset
*charset
= CHARSET_FROM_ID (charbuf
[3]);
6883 Fput_text_property (make_number (from
), make_number (pos
),
6884 Qcharset
, CHARSET_NAME (charset
),
6885 coding
->dst_object
);
6889 #define CHARBUF_SIZE 0x4000
6891 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6893 int size = CHARBUF_SIZE; \
6895 coding->charbuf = NULL; \
6896 while (size > 1024) \
6898 coding->charbuf = alloca (sizeof (int) * size); \
6899 if (coding->charbuf) \
6903 if (! coding->charbuf) \
6905 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6908 coding->charbuf_size = size; \
6913 produce_annotation (struct coding_system
*coding
, ptrdiff_t pos
)
6915 int *charbuf
= coding
->charbuf
;
6916 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
6918 if (NILP (coding
->dst_object
))
6921 while (charbuf
< charbuf_end
)
6927 int len
= -*charbuf
;
6932 case CODING_ANNOTATE_COMPOSITION_MASK
:
6933 produce_composition (coding
, charbuf
, pos
);
6935 case CODING_ANNOTATE_CHARSET_MASK
:
6936 produce_charset (coding
, charbuf
, pos
);
6944 /* Decode the data at CODING->src_object into CODING->dst_object.
6945 CODING->src_object is a buffer, a string, or nil.
6946 CODING->dst_object is a buffer.
6948 If CODING->src_object is a buffer, it must be the current buffer.
6949 In this case, if CODING->src_pos is positive, it is a position of
6950 the source text in the buffer, otherwise, the source text is in the
6951 gap area of the buffer, and CODING->src_pos specifies the offset of
6952 the text from GPT (which must be the same as PT). If this is the
6953 same buffer as CODING->dst_object, CODING->src_pos must be
6956 If CODING->src_object is a string, CODING->src_pos is an index to
6959 If CODING->src_object is nil, CODING->source must already point to
6960 the non-relocatable memory area. In this case, CODING->src_pos is
6961 an offset from CODING->source.
6963 The decoded data is inserted at the current point of the buffer
6968 decode_coding (struct coding_system
*coding
)
6971 Lisp_Object undo_list
;
6972 Lisp_Object translation_table
;
6973 struct ccl_spec cclspec
;
6977 if (BUFFERP (coding
->src_object
)
6978 && coding
->src_pos
> 0
6979 && coding
->src_pos
< GPT
6980 && coding
->src_pos
+ coding
->src_chars
> GPT
)
6981 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
6984 if (BUFFERP (coding
->dst_object
))
6986 set_buffer_internal (XBUFFER (coding
->dst_object
));
6988 move_gap_both (PT
, PT_BYTE
);
6990 /* We must disable undo_list in order to record the whole insert
6991 transaction via record_insert at the end. But doing so also
6992 disables the recording of the first change to the undo_list.
6993 Therefore we check for first change here and record it via
6994 record_first_change if needed. */
6995 if (MODIFF
<= SAVE_MODIFF
)
6996 record_first_change ();
6998 undo_list
= BVAR (current_buffer
, undo_list
);
6999 bset_undo_list (current_buffer
, Qt
);
7002 coding
->consumed
= coding
->consumed_char
= 0;
7003 coding
->produced
= coding
->produced_char
= 0;
7004 coding
->chars_at_source
= 0;
7005 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
7008 ALLOC_CONVERSION_WORK_AREA (coding
);
7010 attrs
= CODING_ID_ATTRS (coding
->id
);
7011 translation_table
= get_translation_table (attrs
, 0, NULL
);
7014 if (coding
->decoder
== decode_coding_ccl
)
7016 coding
->spec
.ccl
= &cclspec
;
7017 setup_ccl_program (&cclspec
.ccl
, CODING_CCL_DECODER (coding
));
7021 ptrdiff_t pos
= coding
->dst_pos
+ coding
->produced_char
;
7023 coding_set_source (coding
);
7024 coding
->annotated
= 0;
7025 coding
->charbuf_used
= carryover
;
7026 (*(coding
->decoder
)) (coding
);
7027 coding_set_destination (coding
);
7028 carryover
= produce_chars (coding
, translation_table
, 0);
7029 if (coding
->annotated
)
7030 produce_annotation (coding
, pos
);
7031 for (i
= 0; i
< carryover
; i
++)
7033 = coding
->charbuf
[coding
->charbuf_used
- carryover
+ i
];
7035 while (coding
->result
== CODING_RESULT_INSUFFICIENT_DST
7036 || (coding
->consumed
< coding
->src_bytes
7037 && (coding
->result
== CODING_RESULT_SUCCESS
7038 || coding
->result
== CODING_RESULT_INVALID_SRC
)));
7042 coding_set_destination (coding
);
7043 coding
->charbuf_used
= carryover
;
7044 produce_chars (coding
, translation_table
, 1);
7047 coding
->carryover_bytes
= 0;
7048 if (coding
->consumed
< coding
->src_bytes
)
7050 int nbytes
= coding
->src_bytes
- coding
->consumed
;
7051 const unsigned char *src
;
7053 coding_set_source (coding
);
7054 coding_set_destination (coding
);
7055 src
= coding
->source
+ coding
->consumed
;
7057 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
7059 /* Flush out unprocessed data as binary chars. We are sure
7060 that the number of data is less than the size of
7062 coding
->charbuf_used
= 0;
7063 coding
->chars_at_source
= 0;
7065 while (nbytes
-- > 0)
7070 c
= BYTE8_TO_CHAR (c
);
7071 coding
->charbuf
[coding
->charbuf_used
++] = c
;
7073 produce_chars (coding
, Qnil
, 1);
7077 /* Record unprocessed bytes in coding->carryover. We are
7078 sure that the number of data is less than the size of
7079 coding->carryover. */
7080 unsigned char *p
= coding
->carryover
;
7082 if (nbytes
> sizeof coding
->carryover
)
7083 nbytes
= sizeof coding
->carryover
;
7084 coding
->carryover_bytes
= nbytes
;
7085 while (nbytes
-- > 0)
7088 coding
->consumed
= coding
->src_bytes
;
7091 if (! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
)
7092 && !inhibit_eol_conversion
)
7093 decode_eol (coding
);
7094 if (BUFFERP (coding
->dst_object
))
7096 bset_undo_list (current_buffer
, undo_list
);
7097 record_insert (coding
->dst_pos
, coding
->produced_char
);
7102 /* Extract an annotation datum from a composition starting at POS and
7103 ending before LIMIT of CODING->src_object (buffer or string), store
7104 the data in BUF, set *STOP to a starting position of the next
7105 composition (if any) or to LIMIT, and return the address of the
7106 next element of BUF.
7108 If such an annotation is not found, set *STOP to a starting
7109 position of a composition after POS (if any) or to LIMIT, and
7113 handle_composition_annotation (ptrdiff_t pos
, ptrdiff_t limit
,
7114 struct coding_system
*coding
, int *buf
,
7117 ptrdiff_t start
, end
;
7120 if (! find_composition (pos
, limit
, &start
, &end
, &prop
, coding
->src_object
)
7123 else if (start
> pos
)
7129 /* We found a composition. Store the corresponding
7130 annotation data in BUF. */
7132 enum composition_method method
= COMPOSITION_METHOD (prop
);
7133 int nchars
= COMPOSITION_LENGTH (prop
);
7135 ADD_COMPOSITION_DATA (buf
, nchars
, 0, method
);
7136 if (method
!= COMPOSITION_RELATIVE
)
7138 Lisp_Object components
;
7139 ptrdiff_t i
, len
, i_byte
;
7141 components
= COMPOSITION_COMPONENTS (prop
);
7142 if (VECTORP (components
))
7144 len
= ASIZE (components
);
7145 for (i
= 0; i
< len
; i
++)
7146 *buf
++ = XINT (AREF (components
, i
));
7148 else if (STRINGP (components
))
7150 len
= SCHARS (components
);
7154 FETCH_STRING_CHAR_ADVANCE (*buf
, components
, i
, i_byte
);
7158 else if (INTEGERP (components
))
7161 *buf
++ = XINT (components
);
7163 else if (CONSP (components
))
7165 for (len
= 0; CONSP (components
);
7166 len
++, components
= XCDR (components
))
7167 *buf
++ = XINT (XCAR (components
));
7175 if (find_composition (end
, limit
, &start
, &end
, &prop
,
7186 /* Extract an annotation datum from a text property `charset' at POS of
7187 CODING->src_object (buffer of string), store the data in BUF, set
7188 *STOP to the position where the value of `charset' property changes
7189 (limiting by LIMIT), and return the address of the next element of
7192 If the property value is nil, set *STOP to the position where the
7193 property value is non-nil (limiting by LIMIT), and return BUF. */
7196 handle_charset_annotation (ptrdiff_t pos
, ptrdiff_t limit
,
7197 struct coding_system
*coding
, int *buf
,
7200 Lisp_Object val
, next
;
7203 val
= Fget_text_property (make_number (pos
), Qcharset
, coding
->src_object
);
7204 if (! NILP (val
) && CHARSETP (val
))
7205 id
= XINT (CHARSET_SYMBOL_ID (val
));
7208 ADD_CHARSET_DATA (buf
, 0, id
);
7209 next
= Fnext_single_property_change (make_number (pos
), Qcharset
,
7211 make_number (limit
));
7212 *stop
= XINT (next
);
7218 consume_chars (struct coding_system
*coding
, Lisp_Object translation_table
,
7221 int *buf
= coding
->charbuf
;
7222 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
;
7223 const unsigned char *src
= coding
->source
+ coding
->consumed
;
7224 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
7225 ptrdiff_t pos
= coding
->src_pos
+ coding
->consumed_char
;
7226 ptrdiff_t end_pos
= coding
->src_pos
+ coding
->src_chars
;
7227 bool multibytep
= coding
->src_multibyte
;
7228 Lisp_Object eol_type
;
7230 ptrdiff_t stop
, stop_composition
, stop_charset
;
7231 int *lookup_buf
= NULL
;
7233 if (! NILP (translation_table
))
7234 lookup_buf
= alloca (sizeof (int) * max_lookup
);
7236 eol_type
= inhibit_eol_conversion
? Qunix
: CODING_ID_EOL_TYPE (coding
->id
);
7237 if (VECTORP (eol_type
))
7240 /* Note: composition handling is not yet implemented. */
7241 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7243 if (NILP (coding
->src_object
))
7244 stop
= stop_composition
= stop_charset
= end_pos
;
7247 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
)
7248 stop
= stop_composition
= pos
;
7250 stop
= stop_composition
= end_pos
;
7251 if (coding
->common_flags
& CODING_ANNOTATE_CHARSET_MASK
)
7252 stop
= stop_charset
= pos
;
7254 stop_charset
= end_pos
;
7257 /* Compensate for CRLF and conversion. */
7258 buf_end
-= 1 + MAX_ANNOTATION_LENGTH
;
7259 while (buf
< buf_end
)
7267 if (pos
== stop_composition
)
7268 buf
= handle_composition_annotation (pos
, end_pos
, coding
,
7269 buf
, &stop_composition
);
7270 if (pos
== stop_charset
)
7271 buf
= handle_charset_annotation (pos
, end_pos
, coding
,
7272 buf
, &stop_charset
);
7273 stop
= (stop_composition
< stop_charset
7274 ? stop_composition
: stop_charset
);
7281 if (coding
->encoder
== encode_coding_raw_text
7282 || coding
->encoder
== encode_coding_ccl
)
7284 else if ((bytes
= MULTIBYTE_LENGTH (src
, src_end
)) > 0)
7285 c
= STRING_CHAR_ADVANCE_NO_UNIFY (src
), pos
+= bytes
;
7287 c
= BYTE8_TO_CHAR (*src
), src
++, pos
++;
7290 c
= STRING_CHAR_ADVANCE_NO_UNIFY (src
), pos
++;
7291 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
7293 if (! EQ (eol_type
, Qunix
))
7297 if (EQ (eol_type
, Qdos
))
7305 LOOKUP_TRANSLATION_TABLE (translation_table
, c
, trans
);
7310 ptrdiff_t from_nchars
= 1, to_nchars
= 1;
7311 int *lookup_buf_end
;
7312 const unsigned char *p
= src
;
7316 for (i
= 1; i
< max_lookup
&& p
< src_end
; i
++)
7317 lookup_buf
[i
] = STRING_CHAR_ADVANCE (p
);
7318 lookup_buf_end
= lookup_buf
+ i
;
7319 trans
= get_translation (trans
, lookup_buf
, lookup_buf_end
);
7320 if (INTEGERP (trans
))
7322 else if (CONSP (trans
))
7324 from_nchars
= ASIZE (XCAR (trans
));
7325 trans
= XCDR (trans
);
7326 if (INTEGERP (trans
))
7330 to_nchars
= ASIZE (trans
);
7331 if (buf_end
- buf
< to_nchars
)
7333 c
= XINT (AREF (trans
, 0));
7339 for (i
= 1; i
< to_nchars
; i
++)
7340 *buf
++ = XINT (AREF (trans
, i
));
7341 for (i
= 1; i
< from_nchars
; i
++, pos
++)
7342 src
+= MULTIBYTE_LENGTH_NO_CHECK (src
);
7346 coding
->consumed
= src
- coding
->source
;
7347 coding
->consumed_char
= pos
- coding
->src_pos
;
7348 coding
->charbuf_used
= buf
- coding
->charbuf
;
7349 coding
->chars_at_source
= 0;
7353 /* Encode the text at CODING->src_object into CODING->dst_object.
7354 CODING->src_object is a buffer or a string.
7355 CODING->dst_object is a buffer or nil.
7357 If CODING->src_object is a buffer, it must be the current buffer.
7358 In this case, if CODING->src_pos is positive, it is a position of
7359 the source text in the buffer, otherwise. the source text is in the
7360 gap area of the buffer, and coding->src_pos specifies the offset of
7361 the text from GPT (which must be the same as PT). If this is the
7362 same buffer as CODING->dst_object, CODING->src_pos must be
7363 negative and CODING should not have `pre-write-conversion'.
7365 If CODING->src_object is a string, CODING should not have
7366 `pre-write-conversion'.
7368 If CODING->dst_object is a buffer, the encoded data is inserted at
7369 the current point of that buffer.
7371 If CODING->dst_object is nil, the encoded data is placed at the
7372 memory area specified by CODING->destination. */
7375 encode_coding (struct coding_system
*coding
)
7378 Lisp_Object translation_table
;
7380 struct ccl_spec cclspec
;
7382 attrs
= CODING_ID_ATTRS (coding
->id
);
7383 if (coding
->encoder
== encode_coding_raw_text
)
7384 translation_table
= Qnil
, max_lookup
= 0;
7386 translation_table
= get_translation_table (attrs
, 1, &max_lookup
);
7388 if (BUFFERP (coding
->dst_object
))
7390 set_buffer_internal (XBUFFER (coding
->dst_object
));
7391 coding
->dst_multibyte
7392 = ! NILP (BVAR (current_buffer
, enable_multibyte_characters
));
7395 coding
->consumed
= coding
->consumed_char
= 0;
7396 coding
->produced
= coding
->produced_char
= 0;
7397 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
7400 ALLOC_CONVERSION_WORK_AREA (coding
);
7402 if (coding
->encoder
== encode_coding_ccl
)
7404 coding
->spec
.ccl
= &cclspec
;
7405 setup_ccl_program (&cclspec
.ccl
, CODING_CCL_ENCODER (coding
));
7408 coding_set_source (coding
);
7409 consume_chars (coding
, translation_table
, max_lookup
);
7410 coding_set_destination (coding
);
7411 (*(coding
->encoder
)) (coding
);
7412 } while (coding
->consumed_char
< coding
->src_chars
);
7414 if (BUFFERP (coding
->dst_object
) && coding
->produced_char
> 0)
7415 insert_from_gap (coding
->produced_char
, coding
->produced
);
7419 /* Name (or base name) of work buffer for code conversion. */
7420 static Lisp_Object Vcode_conversion_workbuf_name
;
7422 /* A working buffer used by the top level conversion. Once it is
7423 created, it is never destroyed. It has the name
7424 Vcode_conversion_workbuf_name. The other working buffers are
7425 destroyed after the use is finished, and their names are modified
7426 versions of Vcode_conversion_workbuf_name. */
7427 static Lisp_Object Vcode_conversion_reused_workbuf
;
7429 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7430 static bool reused_workbuf_in_use
;
7433 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7434 multibyteness of returning buffer. */
7437 make_conversion_work_buffer (bool multibyte
)
7439 Lisp_Object name
, workbuf
;
7440 struct buffer
*current
;
7442 if (reused_workbuf_in_use
)
7444 name
= Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name
, Qnil
);
7445 workbuf
= Fget_buffer_create (name
);
7449 reused_workbuf_in_use
= 1;
7450 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf
)))
7451 Vcode_conversion_reused_workbuf
7452 = Fget_buffer_create (Vcode_conversion_workbuf_name
);
7453 workbuf
= Vcode_conversion_reused_workbuf
;
7455 current
= current_buffer
;
7456 set_buffer_internal (XBUFFER (workbuf
));
7457 /* We can't allow modification hooks to run in the work buffer. For
7458 instance, directory_files_internal assumes that file decoding
7459 doesn't compile new regexps. */
7460 Fset (Fmake_local_variable (Qinhibit_modification_hooks
), Qt
);
7462 bset_undo_list (current_buffer
, Qt
);
7463 bset_enable_multibyte_characters (current_buffer
, multibyte
? Qt
: Qnil
);
7464 set_buffer_internal (current
);
7470 code_conversion_restore (Lisp_Object arg
)
7472 Lisp_Object current
, workbuf
;
7473 struct gcpro gcpro1
;
7476 current
= XCAR (arg
);
7477 workbuf
= XCDR (arg
);
7478 if (! NILP (workbuf
))
7480 if (EQ (workbuf
, Vcode_conversion_reused_workbuf
))
7481 reused_workbuf_in_use
= 0;
7483 Fkill_buffer (workbuf
);
7485 set_buffer_internal (XBUFFER (current
));
7491 code_conversion_save (bool with_work_buf
, bool multibyte
)
7493 Lisp_Object workbuf
= Qnil
;
7496 workbuf
= make_conversion_work_buffer (multibyte
);
7497 record_unwind_protect (code_conversion_restore
,
7498 Fcons (Fcurrent_buffer (), workbuf
));
7503 decode_coding_gap (struct coding_system
*coding
,
7504 ptrdiff_t chars
, ptrdiff_t bytes
)
7506 ptrdiff_t count
= SPECPDL_INDEX ();
7509 code_conversion_save (0, 0);
7511 coding
->src_object
= Fcurrent_buffer ();
7512 coding
->src_chars
= chars
;
7513 coding
->src_bytes
= bytes
;
7514 coding
->src_pos
= -chars
;
7515 coding
->src_pos_byte
= -bytes
;
7516 coding
->src_multibyte
= chars
< bytes
;
7517 coding
->dst_object
= coding
->src_object
;
7518 coding
->dst_pos
= PT
;
7519 coding
->dst_pos_byte
= PT_BYTE
;
7520 coding
->dst_multibyte
= ! NILP (BVAR (current_buffer
, enable_multibyte_characters
));
7522 if (CODING_REQUIRE_DETECTION (coding
))
7523 detect_coding (coding
);
7525 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
7526 current_buffer
->text
->inhibit_shrinking
= 1;
7527 decode_coding (coding
);
7528 current_buffer
->text
->inhibit_shrinking
= 0;
7530 attrs
= CODING_ID_ATTRS (coding
->id
);
7531 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
7533 ptrdiff_t prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
7536 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
7537 val
= call1 (CODING_ATTR_POST_READ (attrs
),
7538 make_number (coding
->produced_char
));
7540 coding
->produced_char
+= Z
- prev_Z
;
7541 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
7544 unbind_to (count
, Qnil
);
7548 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7549 SRC_OBJECT into DST_OBJECT by coding context CODING.
7551 SRC_OBJECT is a buffer, a string, or Qnil.
7553 If it is a buffer, the text is at point of the buffer. FROM and TO
7554 are positions in the buffer.
7556 If it is a string, the text is at the beginning of the string.
7557 FROM and TO are indices to the string.
7559 If it is nil, the text is at coding->source. FROM and TO are
7560 indices to coding->source.
7562 DST_OBJECT is a buffer, Qt, or Qnil.
7564 If it is a buffer, the decoded text is inserted at point of the
7565 buffer. If the buffer is the same as SRC_OBJECT, the source text
7568 If it is Qt, a string is made from the decoded text, and
7569 set in CODING->dst_object.
7571 If it is Qnil, the decoded text is stored at CODING->destination.
7572 The caller must allocate CODING->dst_bytes bytes at
7573 CODING->destination by xmalloc. If the decoded text is longer than
7574 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7578 decode_coding_object (struct coding_system
*coding
,
7579 Lisp_Object src_object
,
7580 ptrdiff_t from
, ptrdiff_t from_byte
,
7581 ptrdiff_t to
, ptrdiff_t to_byte
,
7582 Lisp_Object dst_object
)
7584 ptrdiff_t count
= SPECPDL_INDEX ();
7585 unsigned char *destination
IF_LINT (= NULL
);
7586 ptrdiff_t dst_bytes
IF_LINT (= 0);
7587 ptrdiff_t chars
= to
- from
;
7588 ptrdiff_t bytes
= to_byte
- from_byte
;
7590 ptrdiff_t saved_pt
= -1, saved_pt_byte
IF_LINT (= 0);
7591 bool need_marker_adjustment
= 0;
7592 Lisp_Object old_deactivate_mark
;
7594 old_deactivate_mark
= Vdeactivate_mark
;
7596 if (NILP (dst_object
))
7598 destination
= coding
->destination
;
7599 dst_bytes
= coding
->dst_bytes
;
7602 coding
->src_object
= src_object
;
7603 coding
->src_chars
= chars
;
7604 coding
->src_bytes
= bytes
;
7605 coding
->src_multibyte
= chars
< bytes
;
7607 if (STRINGP (src_object
))
7609 coding
->src_pos
= from
;
7610 coding
->src_pos_byte
= from_byte
;
7612 else if (BUFFERP (src_object
))
7614 set_buffer_internal (XBUFFER (src_object
));
7616 move_gap_both (from
, from_byte
);
7617 if (EQ (src_object
, dst_object
))
7619 struct Lisp_Marker
*tail
;
7621 for (tail
= BUF_MARKERS (current_buffer
); tail
; tail
= tail
->next
)
7623 tail
->need_adjustment
7624 = tail
->charpos
== (tail
->insertion_type
? from
: to
);
7625 need_marker_adjustment
|= tail
->need_adjustment
;
7627 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
7628 TEMP_SET_PT_BOTH (from
, from_byte
);
7629 current_buffer
->text
->inhibit_shrinking
= 1;
7630 del_range_both (from
, from_byte
, to
, to_byte
, 1);
7631 coding
->src_pos
= -chars
;
7632 coding
->src_pos_byte
= -bytes
;
7636 coding
->src_pos
= from
;
7637 coding
->src_pos_byte
= from_byte
;
7641 if (CODING_REQUIRE_DETECTION (coding
))
7642 detect_coding (coding
);
7643 attrs
= CODING_ID_ATTRS (coding
->id
);
7645 if (EQ (dst_object
, Qt
)
7646 || (! NILP (CODING_ATTR_POST_READ (attrs
))
7647 && NILP (dst_object
)))
7649 coding
->dst_multibyte
= !CODING_FOR_UNIBYTE (coding
);
7650 coding
->dst_object
= code_conversion_save (1, coding
->dst_multibyte
);
7651 coding
->dst_pos
= BEG
;
7652 coding
->dst_pos_byte
= BEG_BYTE
;
7654 else if (BUFFERP (dst_object
))
7656 code_conversion_save (0, 0);
7657 coding
->dst_object
= dst_object
;
7658 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
7659 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
7660 coding
->dst_multibyte
7661 = ! NILP (BVAR (XBUFFER (dst_object
), enable_multibyte_characters
));
7665 code_conversion_save (0, 0);
7666 coding
->dst_object
= Qnil
;
7667 /* Most callers presume this will return a multibyte result, and they
7668 won't use `binary' or `raw-text' anyway, so let's not worry about
7669 CODING_FOR_UNIBYTE. */
7670 coding
->dst_multibyte
= 1;
7673 decode_coding (coding
);
7675 if (BUFFERP (coding
->dst_object
))
7676 set_buffer_internal (XBUFFER (coding
->dst_object
));
7678 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
7680 struct gcpro gcpro1
, gcpro2
, gcpro3
, gcpro4
, gcpro5
;
7681 ptrdiff_t prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
7684 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
7685 GCPRO5 (coding
->src_object
, coding
->dst_object
, src_object
, dst_object
,
7686 old_deactivate_mark
);
7687 val
= safe_call1 (CODING_ATTR_POST_READ (attrs
),
7688 make_number (coding
->produced_char
));
7691 coding
->produced_char
+= Z
- prev_Z
;
7692 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
7695 if (EQ (dst_object
, Qt
))
7697 coding
->dst_object
= Fbuffer_string ();
7699 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
7701 set_buffer_internal (XBUFFER (coding
->dst_object
));
7702 if (dst_bytes
< coding
->produced
)
7704 destination
= xrealloc (destination
, coding
->produced
);
7707 record_conversion_result (coding
,
7708 CODING_RESULT_INSUFFICIENT_MEM
);
7709 unbind_to (count
, Qnil
);
7712 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
7713 move_gap_both (BEGV
, BEGV_BYTE
);
7714 memcpy (destination
, BEGV_ADDR
, coding
->produced
);
7715 coding
->destination
= destination
;
7721 /* This is the case of:
7722 (BUFFERP (src_object) && EQ (src_object, dst_object))
7723 As we have moved PT while replacing the original buffer
7724 contents, we must recover it now. */
7725 set_buffer_internal (XBUFFER (src_object
));
7726 current_buffer
->text
->inhibit_shrinking
= 0;
7727 if (saved_pt
< from
)
7728 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
7729 else if (saved_pt
< from
+ chars
)
7730 TEMP_SET_PT_BOTH (from
, from_byte
);
7731 else if (! NILP (BVAR (current_buffer
, enable_multibyte_characters
)))
7732 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
7733 saved_pt_byte
+ (coding
->produced
- bytes
));
7735 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
7736 saved_pt_byte
+ (coding
->produced
- bytes
));
7738 if (need_marker_adjustment
)
7740 struct Lisp_Marker
*tail
;
7742 for (tail
= BUF_MARKERS (current_buffer
); tail
; tail
= tail
->next
)
7743 if (tail
->need_adjustment
)
7745 tail
->need_adjustment
= 0;
7746 if (tail
->insertion_type
)
7748 tail
->bytepos
= from_byte
;
7749 tail
->charpos
= from
;
7753 tail
->bytepos
= from_byte
+ coding
->produced
;
7755 = (NILP (BVAR (current_buffer
, enable_multibyte_characters
))
7756 ? tail
->bytepos
: from
+ coding
->produced_char
);
7762 Vdeactivate_mark
= old_deactivate_mark
;
7763 unbind_to (count
, coding
->dst_object
);
7768 encode_coding_object (struct coding_system
*coding
,
7769 Lisp_Object src_object
,
7770 ptrdiff_t from
, ptrdiff_t from_byte
,
7771 ptrdiff_t to
, ptrdiff_t to_byte
,
7772 Lisp_Object dst_object
)
7774 ptrdiff_t count
= SPECPDL_INDEX ();
7775 ptrdiff_t chars
= to
- from
;
7776 ptrdiff_t bytes
= to_byte
- from_byte
;
7778 ptrdiff_t saved_pt
= -1, saved_pt_byte
IF_LINT (= 0);
7779 bool need_marker_adjustment
= 0;
7780 bool kill_src_buffer
= 0;
7781 Lisp_Object old_deactivate_mark
;
7783 old_deactivate_mark
= Vdeactivate_mark
;
7785 coding
->src_object
= src_object
;
7786 coding
->src_chars
= chars
;
7787 coding
->src_bytes
= bytes
;
7788 coding
->src_multibyte
= chars
< bytes
;
7790 attrs
= CODING_ID_ATTRS (coding
->id
);
7792 if (EQ (src_object
, dst_object
))
7794 struct Lisp_Marker
*tail
;
7796 for (tail
= BUF_MARKERS (current_buffer
); tail
; tail
= tail
->next
)
7798 tail
->need_adjustment
7799 = tail
->charpos
== (tail
->insertion_type
? from
: to
);
7800 need_marker_adjustment
|= tail
->need_adjustment
;
7804 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
7806 coding
->src_object
= code_conversion_save (1, coding
->src_multibyte
);
7807 set_buffer_internal (XBUFFER (coding
->src_object
));
7808 if (STRINGP (src_object
))
7809 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
7810 else if (BUFFERP (src_object
))
7811 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
7813 insert_1_both ((char *) coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
7815 if (EQ (src_object
, dst_object
))
7817 set_buffer_internal (XBUFFER (src_object
));
7818 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
7819 del_range_both (from
, from_byte
, to
, to_byte
, 1);
7820 set_buffer_internal (XBUFFER (coding
->src_object
));
7824 struct gcpro gcpro1
, gcpro2
, gcpro3
, gcpro4
, gcpro5
;
7826 GCPRO5 (coding
->src_object
, coding
->dst_object
, src_object
, dst_object
,
7827 old_deactivate_mark
);
7828 safe_call2 (CODING_ATTR_PRE_WRITE (attrs
),
7829 make_number (BEG
), make_number (Z
));
7832 if (XBUFFER (coding
->src_object
) != current_buffer
)
7833 kill_src_buffer
= 1;
7834 coding
->src_object
= Fcurrent_buffer ();
7836 move_gap_both (BEG
, BEG_BYTE
);
7837 coding
->src_chars
= Z
- BEG
;
7838 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
7839 coding
->src_pos
= BEG
;
7840 coding
->src_pos_byte
= BEG_BYTE
;
7841 coding
->src_multibyte
= Z
< Z_BYTE
;
7843 else if (STRINGP (src_object
))
7845 code_conversion_save (0, 0);
7846 coding
->src_pos
= from
;
7847 coding
->src_pos_byte
= from_byte
;
7849 else if (BUFFERP (src_object
))
7851 code_conversion_save (0, 0);
7852 set_buffer_internal (XBUFFER (src_object
));
7853 if (EQ (src_object
, dst_object
))
7855 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
7856 coding
->src_object
= del_range_1 (from
, to
, 1, 1);
7857 coding
->src_pos
= 0;
7858 coding
->src_pos_byte
= 0;
7862 if (from
< GPT
&& to
>= GPT
)
7863 move_gap_both (from
, from_byte
);
7864 coding
->src_pos
= from
;
7865 coding
->src_pos_byte
= from_byte
;
7869 code_conversion_save (0, 0);
7871 if (BUFFERP (dst_object
))
7873 coding
->dst_object
= dst_object
;
7874 if (EQ (src_object
, dst_object
))
7876 coding
->dst_pos
= from
;
7877 coding
->dst_pos_byte
= from_byte
;
7881 struct buffer
*current
= current_buffer
;
7883 set_buffer_temp (XBUFFER (dst_object
));
7884 coding
->dst_pos
= PT
;
7885 coding
->dst_pos_byte
= PT_BYTE
;
7886 move_gap_both (coding
->dst_pos
, coding
->dst_pos_byte
);
7887 set_buffer_temp (current
);
7889 coding
->dst_multibyte
7890 = ! NILP (BVAR (XBUFFER (dst_object
), enable_multibyte_characters
));
7892 else if (EQ (dst_object
, Qt
))
7894 ptrdiff_t dst_bytes
= max (1, coding
->src_chars
);
7895 coding
->dst_object
= Qnil
;
7896 coding
->destination
= xmalloc (dst_bytes
);
7897 coding
->dst_bytes
= dst_bytes
;
7898 coding
->dst_multibyte
= 0;
7902 coding
->dst_object
= Qnil
;
7903 coding
->dst_multibyte
= 0;
7906 encode_coding (coding
);
7908 if (EQ (dst_object
, Qt
))
7910 if (BUFFERP (coding
->dst_object
))
7911 coding
->dst_object
= Fbuffer_string ();
7915 = make_unibyte_string ((char *) coding
->destination
,
7917 xfree (coding
->destination
);
7923 /* This is the case of:
7924 (BUFFERP (src_object) && EQ (src_object, dst_object))
7925 As we have moved PT while replacing the original buffer
7926 contents, we must recover it now. */
7927 set_buffer_internal (XBUFFER (src_object
));
7928 if (saved_pt
< from
)
7929 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
7930 else if (saved_pt
< from
+ chars
)
7931 TEMP_SET_PT_BOTH (from
, from_byte
);
7932 else if (! NILP (BVAR (current_buffer
, enable_multibyte_characters
)))
7933 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
7934 saved_pt_byte
+ (coding
->produced
- bytes
));
7936 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
7937 saved_pt_byte
+ (coding
->produced
- bytes
));
7939 if (need_marker_adjustment
)
7941 struct Lisp_Marker
*tail
;
7943 for (tail
= BUF_MARKERS (current_buffer
); tail
; tail
= tail
->next
)
7944 if (tail
->need_adjustment
)
7946 tail
->need_adjustment
= 0;
7947 if (tail
->insertion_type
)
7949 tail
->bytepos
= from_byte
;
7950 tail
->charpos
= from
;
7954 tail
->bytepos
= from_byte
+ coding
->produced
;
7956 = (NILP (BVAR (current_buffer
, enable_multibyte_characters
))
7957 ? tail
->bytepos
: from
+ coding
->produced_char
);
7963 if (kill_src_buffer
)
7964 Fkill_buffer (coding
->src_object
);
7966 Vdeactivate_mark
= old_deactivate_mark
;
7967 unbind_to (count
, Qnil
);
7972 preferred_coding_system (void)
7974 int id
= coding_categories
[coding_priorities
[0]].id
;
7976 return CODING_ID_NAME (id
);
7979 #if defined (WINDOWSNT) || defined (CYGWIN)
7982 from_unicode (Lisp_Object str
)
7985 if (!STRING_MULTIBYTE (str
) &&
7988 str
= Fsubstring (str
, make_number (0), make_number (-1));
7991 return code_convert_string_norecord (str
, Qutf_16le
, 0);
7995 to_unicode (Lisp_Object str
, Lisp_Object
*buf
)
7997 *buf
= code_convert_string_norecord (str
, Qutf_16le
, 1);
7998 /* We need to make a another copy (in addition to the one made by
7999 code_convert_string_norecord) to ensure that the final string is
8000 _doubly_ zero terminated --- that is, that the string is
8001 terminated by two zero bytes and one utf-16le null character.
8002 Because strings are already terminated with a single zero byte,
8003 we just add one additional zero. */
8004 str
= make_uninit_string (SBYTES (*buf
) + 1);
8005 memcpy (SDATA (str
), SDATA (*buf
), SBYTES (*buf
));
8006 SDATA (str
) [SBYTES (*buf
)] = '\0';
8008 return WCSDATA (*buf
);
8011 #endif /* WINDOWSNT || CYGWIN */
8015 /*** 8. Emacs Lisp library functions ***/
8017 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
8018 doc
: /* Return t if OBJECT is nil or a coding-system.
8019 See the documentation of `define-coding-system' for information
8020 about coding-system objects. */)
8021 (Lisp_Object object
)
8024 || CODING_SYSTEM_ID (object
) >= 0)
8026 if (! SYMBOLP (object
)
8027 || NILP (Fget (object
, Qcoding_system_define_form
)))
8032 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
8033 Sread_non_nil_coding_system
, 1, 1, 0,
8034 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8035 (Lisp_Object prompt
)
8040 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
8041 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
8043 while (SCHARS (val
) == 0);
8044 return (Fintern (val
, Qnil
));
8047 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
8048 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8049 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8050 Ignores case when completing coding systems (all Emacs coding systems
8051 are lower-case). */)
8052 (Lisp_Object prompt
, Lisp_Object default_coding_system
)
8055 ptrdiff_t count
= SPECPDL_INDEX ();
8057 if (SYMBOLP (default_coding_system
))
8058 default_coding_system
= SYMBOL_NAME (default_coding_system
);
8059 specbind (Qcompletion_ignore_case
, Qt
);
8060 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
8061 Qt
, Qnil
, Qcoding_system_history
,
8062 default_coding_system
, Qnil
);
8063 unbind_to (count
, Qnil
);
8064 return (SCHARS (val
) == 0 ? Qnil
: Fintern (val
, Qnil
));
8067 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
8069 doc
: /* Check validity of CODING-SYSTEM.
8070 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8071 It is valid if it is nil or a symbol defined as a coding system by the
8072 function `define-coding-system'. */)
8073 (Lisp_Object coding_system
)
8075 Lisp_Object define_form
;
8077 define_form
= Fget (coding_system
, Qcoding_system_define_form
);
8078 if (! NILP (define_form
))
8080 Fput (coding_system
, Qcoding_system_define_form
, Qnil
);
8081 safe_eval (define_form
);
8083 if (!NILP (Fcoding_system_p (coding_system
)))
8084 return coding_system
;
8085 xsignal1 (Qcoding_system_error
, coding_system
);
8089 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8090 HIGHEST, return the coding system of the highest
8091 priority among the detected coding systems. Otherwise return a
8092 list of detected coding systems sorted by their priorities. If
8093 MULTIBYTEP, it is assumed that the bytes are in correct
8094 multibyte form but contains only ASCII and eight-bit chars.
8095 Otherwise, the bytes are raw bytes.
8097 CODING-SYSTEM controls the detection as below:
8099 If it is nil, detect both text-format and eol-format. If the
8100 text-format part of CODING-SYSTEM is already specified
8101 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8102 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8103 detect only text-format. */
8106 detect_coding_system (const unsigned char *src
,
8107 ptrdiff_t src_chars
, ptrdiff_t src_bytes
,
8108 bool highest
, bool multibytep
,
8109 Lisp_Object coding_system
)
8111 const unsigned char *src_end
= src
+ src_bytes
;
8112 Lisp_Object attrs
, eol_type
;
8113 Lisp_Object val
= Qnil
;
8114 struct coding_system coding
;
8116 struct coding_detection_info detect_info
;
8117 enum coding_category base_category
;
8118 bool null_byte_found
= 0, eight_bit_found
= 0;
8120 if (NILP (coding_system
))
8121 coding_system
= Qundecided
;
8122 setup_coding_system (coding_system
, &coding
);
8123 attrs
= CODING_ID_ATTRS (coding
.id
);
8124 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
8125 coding_system
= CODING_ATTR_BASE_NAME (attrs
);
8127 coding
.source
= src
;
8128 coding
.src_chars
= src_chars
;
8129 coding
.src_bytes
= src_bytes
;
8130 coding
.src_multibyte
= multibytep
;
8131 coding
.consumed
= 0;
8132 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
8133 coding
.head_ascii
= 0;
8135 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
8137 /* At first, detect text-format if necessary. */
8138 base_category
= XINT (CODING_ATTR_CATEGORY (attrs
));
8139 if (base_category
== coding_category_undecided
)
8141 enum coding_category category
IF_LINT (= 0);
8142 struct coding_system
*this IF_LINT (= NULL
);
8145 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8146 for (; src
< src_end
; src
++)
8151 eight_bit_found
= 1;
8152 if (null_byte_found
)
8157 if ((c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
8158 && ! inhibit_iso_escape_detection
8159 && ! detect_info
.checked
)
8161 if (detect_coding_iso_2022 (&coding
, &detect_info
))
8163 /* We have scanned the whole data. */
8164 if (! (detect_info
.rejected
& CATEGORY_MASK_ISO_7_ELSE
))
8166 /* We didn't find an 8-bit code. We may
8167 have found a null-byte, but it's very
8168 rare that a binary file confirm to
8171 coding
.head_ascii
= src
- coding
.source
;
8173 detect_info
.rejected
|= ~CATEGORY_MASK_ISO_ESCAPE
;
8177 else if (! c
&& !inhibit_null_byte_detection
)
8179 null_byte_found
= 1;
8180 if (eight_bit_found
)
8183 if (! eight_bit_found
)
8184 coding
.head_ascii
++;
8186 else if (! eight_bit_found
)
8187 coding
.head_ascii
++;
8190 if (null_byte_found
|| eight_bit_found
8191 || coding
.head_ascii
< coding
.src_bytes
8192 || detect_info
.found
)
8194 if (coding
.head_ascii
== coding
.src_bytes
)
8195 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8196 for (i
= 0; i
< coding_category_raw_text
; i
++)
8198 category
= coding_priorities
[i
];
8199 this = coding_categories
+ category
;
8200 if (detect_info
.found
& (1 << category
))
8205 if (null_byte_found
)
8207 detect_info
.checked
|= ~CATEGORY_MASK_UTF_16
;
8208 detect_info
.rejected
|= ~CATEGORY_MASK_UTF_16
;
8210 for (i
= 0; i
< coding_category_raw_text
; i
++)
8212 category
= coding_priorities
[i
];
8213 this = coding_categories
+ category
;
8217 /* No coding system of this category is defined. */
8218 detect_info
.rejected
|= (1 << category
);
8220 else if (category
>= coding_category_raw_text
)
8222 else if (detect_info
.checked
& (1 << category
))
8225 && (detect_info
.found
& (1 << category
)))
8228 else if ((*(this->detector
)) (&coding
, &detect_info
)
8230 && (detect_info
.found
& (1 << category
)))
8232 if (category
== coding_category_utf_16_auto
)
8234 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
8235 category
= coding_category_utf_16_le
;
8237 category
= coding_category_utf_16_be
;
8245 if ((detect_info
.rejected
& CATEGORY_MASK_ANY
) == CATEGORY_MASK_ANY
8248 detect_info
.found
= CATEGORY_MASK_RAW_TEXT
;
8249 id
= CODING_SYSTEM_ID (Qno_conversion
);
8250 val
= Fcons (make_number (id
), Qnil
);
8252 else if (! detect_info
.rejected
&& ! detect_info
.found
)
8254 detect_info
.found
= CATEGORY_MASK_ANY
;
8255 id
= coding_categories
[coding_category_undecided
].id
;
8256 val
= Fcons (make_number (id
), Qnil
);
8260 if (detect_info
.found
)
8262 detect_info
.found
= 1 << category
;
8263 val
= Fcons (make_number (this->id
), Qnil
);
8266 for (i
= 0; i
< coding_category_raw_text
; i
++)
8267 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
8269 detect_info
.found
= 1 << coding_priorities
[i
];
8270 id
= coding_categories
[coding_priorities
[i
]].id
;
8271 val
= Fcons (make_number (id
), Qnil
);
8277 int mask
= detect_info
.rejected
| detect_info
.found
;
8280 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
8282 category
= coding_priorities
[i
];
8283 if (! (mask
& (1 << category
)))
8285 found
|= 1 << category
;
8286 id
= coding_categories
[category
].id
;
8288 val
= Fcons (make_number (id
), val
);
8291 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
8293 category
= coding_priorities
[i
];
8294 if (detect_info
.found
& (1 << category
))
8296 id
= coding_categories
[category
].id
;
8297 val
= Fcons (make_number (id
), val
);
8300 detect_info
.found
|= found
;
8303 else if (base_category
== coding_category_utf_8_auto
)
8305 if (detect_coding_utf_8 (&coding
, &detect_info
))
8307 struct coding_system
*this;
8309 if (detect_info
.found
& CATEGORY_MASK_UTF_8_SIG
)
8310 this = coding_categories
+ coding_category_utf_8_sig
;
8312 this = coding_categories
+ coding_category_utf_8_nosig
;
8313 val
= Fcons (make_number (this->id
), Qnil
);
8316 else if (base_category
== coding_category_utf_16_auto
)
8318 if (detect_coding_utf_16 (&coding
, &detect_info
))
8320 struct coding_system
*this;
8322 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
8323 this = coding_categories
+ coding_category_utf_16_le
;
8324 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
8325 this = coding_categories
+ coding_category_utf_16_be
;
8326 else if (detect_info
.rejected
& CATEGORY_MASK_UTF_16_LE_NOSIG
)
8327 this = coding_categories
+ coding_category_utf_16_be_nosig
;
8329 this = coding_categories
+ coding_category_utf_16_le_nosig
;
8330 val
= Fcons (make_number (this->id
), Qnil
);
8335 detect_info
.found
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
8336 val
= Fcons (make_number (coding
.id
), Qnil
);
8339 /* Then, detect eol-format if necessary. */
8341 int normal_eol
= -1, utf_16_be_eol
= -1, utf_16_le_eol
= -1;
8344 if (VECTORP (eol_type
))
8346 if (detect_info
.found
& ~CATEGORY_MASK_UTF_16
)
8348 if (null_byte_found
)
8349 normal_eol
= EOL_SEEN_LF
;
8351 normal_eol
= detect_eol (coding
.source
, src_bytes
,
8352 coding_category_raw_text
);
8354 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_BE
8355 | CATEGORY_MASK_UTF_16_BE_NOSIG
))
8356 utf_16_be_eol
= detect_eol (coding
.source
, src_bytes
,
8357 coding_category_utf_16_be
);
8358 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
8359 | CATEGORY_MASK_UTF_16_LE_NOSIG
))
8360 utf_16_le_eol
= detect_eol (coding
.source
, src_bytes
,
8361 coding_category_utf_16_le
);
8365 if (EQ (eol_type
, Qunix
))
8366 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_LF
;
8367 else if (EQ (eol_type
, Qdos
))
8368 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CRLF
;
8370 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CR
;
8373 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
8375 enum coding_category category
;
8378 id
= XINT (XCAR (tail
));
8379 attrs
= CODING_ID_ATTRS (id
);
8380 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
8381 eol_type
= CODING_ID_EOL_TYPE (id
);
8382 if (VECTORP (eol_type
))
8384 if (category
== coding_category_utf_16_be
8385 || category
== coding_category_utf_16_be_nosig
)
8386 this_eol
= utf_16_be_eol
;
8387 else if (category
== coding_category_utf_16_le
8388 || category
== coding_category_utf_16_le_nosig
)
8389 this_eol
= utf_16_le_eol
;
8391 this_eol
= normal_eol
;
8393 if (this_eol
== EOL_SEEN_LF
)
8394 XSETCAR (tail
, AREF (eol_type
, 0));
8395 else if (this_eol
== EOL_SEEN_CRLF
)
8396 XSETCAR (tail
, AREF (eol_type
, 1));
8397 else if (this_eol
== EOL_SEEN_CR
)
8398 XSETCAR (tail
, AREF (eol_type
, 2));
8400 XSETCAR (tail
, CODING_ID_NAME (id
));
8403 XSETCAR (tail
, CODING_ID_NAME (id
));
8407 return (highest
? (CONSP (val
) ? XCAR (val
) : Qnil
) : val
);
8411 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
8413 doc
: /* Detect coding system of the text in the region between START and END.
8414 Return a list of possible coding systems ordered by priority.
8415 The coding systems to try and their priorities follows what
8416 the function `coding-system-priority-list' (which see) returns.
8418 If only ASCII characters are found (except for such ISO-2022 control
8419 characters as ESC), it returns a list of single element `undecided'
8420 or its subsidiary coding system according to a detected end-of-line
8423 If optional argument HIGHEST is non-nil, return the coding system of
8424 highest priority. */)
8425 (Lisp_Object start
, Lisp_Object end
, Lisp_Object highest
)
8428 ptrdiff_t from_byte
, to_byte
;
8430 CHECK_NUMBER_COERCE_MARKER (start
);
8431 CHECK_NUMBER_COERCE_MARKER (end
);
8433 validate_region (&start
, &end
);
8434 from
= XINT (start
), to
= XINT (end
);
8435 from_byte
= CHAR_TO_BYTE (from
);
8436 to_byte
= CHAR_TO_BYTE (to
);
8438 if (from
< GPT
&& to
>= GPT
)
8439 move_gap_both (to
, to_byte
);
8441 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
8442 to
- from
, to_byte
- from_byte
,
8444 !NILP (BVAR (current_buffer
8445 , enable_multibyte_characters
)),
8449 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
8451 doc
: /* Detect coding system of the text in STRING.
8452 Return a list of possible coding systems ordered by priority.
8453 The coding systems to try and their priorities follows what
8454 the function `coding-system-priority-list' (which see) returns.
8456 If only ASCII characters are found (except for such ISO-2022 control
8457 characters as ESC), it returns a list of single element `undecided'
8458 or its subsidiary coding system according to a detected end-of-line
8461 If optional argument HIGHEST is non-nil, return the coding system of
8462 highest priority. */)
8463 (Lisp_Object string
, Lisp_Object highest
)
8465 CHECK_STRING (string
);
8467 return detect_coding_system (SDATA (string
),
8468 SCHARS (string
), SBYTES (string
),
8469 !NILP (highest
), STRING_MULTIBYTE (string
),
8475 char_encodable_p (int c
, Lisp_Object attrs
)
8478 struct charset
*charset
;
8479 Lisp_Object translation_table
;
8481 translation_table
= CODING_ATTR_TRANS_TBL (attrs
);
8482 if (! NILP (translation_table
))
8483 c
= translate_char (translation_table
, c
);
8484 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
8485 CONSP (tail
); tail
= XCDR (tail
))
8487 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
8488 if (CHAR_CHARSET_P (c
, charset
))
8491 return (! NILP (tail
));
8495 /* Return a list of coding systems that safely encode the text between
8496 START and END. If EXCLUDE is non-nil, it is a list of coding
8497 systems not to check. The returned list doesn't contain any such
8498 coding systems. In any case, if the text contains only ASCII or is
8499 unibyte, return t. */
8501 DEFUN ("find-coding-systems-region-internal",
8502 Ffind_coding_systems_region_internal
,
8503 Sfind_coding_systems_region_internal
, 2, 3, 0,
8504 doc
: /* Internal use only. */)
8505 (Lisp_Object start
, Lisp_Object end
, Lisp_Object exclude
)
8507 Lisp_Object coding_attrs_list
, safe_codings
;
8508 ptrdiff_t start_byte
, end_byte
;
8509 const unsigned char *p
, *pbeg
, *pend
;
8511 Lisp_Object tail
, elt
, work_table
;
8513 if (STRINGP (start
))
8515 if (!STRING_MULTIBYTE (start
)
8516 || SCHARS (start
) == SBYTES (start
))
8519 end_byte
= SBYTES (start
);
8523 CHECK_NUMBER_COERCE_MARKER (start
);
8524 CHECK_NUMBER_COERCE_MARKER (end
);
8525 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
8526 args_out_of_range (start
, end
);
8527 if (NILP (BVAR (current_buffer
, enable_multibyte_characters
)))
8529 start_byte
= CHAR_TO_BYTE (XINT (start
));
8530 end_byte
= CHAR_TO_BYTE (XINT (end
));
8531 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
8534 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
8536 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
8537 move_gap_both (XINT (start
), start_byte
);
8539 move_gap_both (XINT (end
), end_byte
);
8543 coding_attrs_list
= Qnil
;
8544 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
8546 || NILP (Fmemq (XCAR (tail
), exclude
)))
8550 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
8551 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
8552 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
8554 ASET (attrs
, coding_attr_trans_tbl
,
8555 get_translation_table (attrs
, 1, NULL
));
8556 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
8560 if (STRINGP (start
))
8561 p
= pbeg
= SDATA (start
);
8563 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
8564 pend
= p
+ (end_byte
- start_byte
);
8566 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
8567 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
8569 work_table
= Fmake_char_table (Qnil
, Qnil
);
8572 if (ASCII_BYTE_P (*p
))
8576 c
= STRING_CHAR_ADVANCE (p
);
8577 if (!NILP (char_table_ref (work_table
, c
)))
8578 /* This character was already checked. Ignore it. */
8581 charset_map_loaded
= 0;
8582 for (tail
= coding_attrs_list
; CONSP (tail
);)
8587 else if (char_encodable_p (c
, elt
))
8589 else if (CONSP (XCDR (tail
)))
8591 XSETCAR (tail
, XCAR (XCDR (tail
)));
8592 XSETCDR (tail
, XCDR (XCDR (tail
)));
8596 XSETCAR (tail
, Qnil
);
8600 if (charset_map_loaded
)
8602 ptrdiff_t p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
8604 if (STRINGP (start
))
8605 pbeg
= SDATA (start
);
8607 pbeg
= BYTE_POS_ADDR (start_byte
);
8608 p
= pbeg
+ p_offset
;
8609 pend
= pbeg
+ pend_offset
;
8611 char_table_set (work_table
, c
, Qt
);
8615 safe_codings
= list2 (Qraw_text
, Qno_conversion
);
8616 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
8617 if (! NILP (XCAR (tail
)))
8618 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
8620 return safe_codings
;
8624 DEFUN ("unencodable-char-position", Funencodable_char_position
,
8625 Sunencodable_char_position
, 3, 5, 0,
8627 Return position of first un-encodable character in a region.
8628 START and END specify the region and CODING-SYSTEM specifies the
8629 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8631 If optional 4th argument COUNT is non-nil, it specifies at most how
8632 many un-encodable characters to search. In this case, the value is a
8635 If optional 5th argument STRING is non-nil, it is a string to search
8636 for un-encodable characters. In that case, START and END are indexes
8638 (Lisp_Object start
, Lisp_Object end
, Lisp_Object coding_system
, Lisp_Object count
, Lisp_Object string
)
8641 struct coding_system coding
;
8642 Lisp_Object attrs
, charset_list
, translation_table
;
8643 Lisp_Object positions
;
8645 const unsigned char *p
, *stop
, *pend
;
8646 bool ascii_compatible
;
8648 setup_coding_system (Fcheck_coding_system (coding_system
), &coding
);
8649 attrs
= CODING_ID_ATTRS (coding
.id
);
8650 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
8652 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
8653 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
8654 translation_table
= get_translation_table (attrs
, 1, NULL
);
8658 validate_region (&start
, &end
);
8659 from
= XINT (start
);
8661 if (NILP (BVAR (current_buffer
, enable_multibyte_characters
))
8662 || (ascii_compatible
8663 && (to
- from
) == (CHAR_TO_BYTE (to
) - (CHAR_TO_BYTE (from
)))))
8665 p
= CHAR_POS_ADDR (from
);
8666 pend
= CHAR_POS_ADDR (to
);
8667 if (from
< GPT
&& to
>= GPT
)
8674 CHECK_STRING (string
);
8675 CHECK_NATNUM (start
);
8677 if (! (XINT (start
) <= XINT (end
) && XINT (end
) <= SCHARS (string
)))
8678 args_out_of_range_3 (string
, start
, end
);
8679 from
= XINT (start
);
8681 if (! STRING_MULTIBYTE (string
))
8683 p
= SDATA (string
) + string_char_to_byte (string
, from
);
8684 stop
= pend
= SDATA (string
) + string_char_to_byte (string
, to
);
8685 if (ascii_compatible
&& (to
- from
) == (pend
- p
))
8693 CHECK_NATNUM (count
);
8698 charset_map_loaded
= 0;
8703 if (ascii_compatible
)
8704 while (p
< stop
&& ASCII_BYTE_P (*p
))
8714 c
= STRING_CHAR_ADVANCE (p
);
8715 if (! (ASCII_CHAR_P (c
) && ascii_compatible
)
8716 && ! char_charset (translate_char (translation_table
, c
),
8717 charset_list
, NULL
))
8719 positions
= Fcons (make_number (from
), positions
);
8726 if (charset_map_loaded
&& NILP (string
))
8728 p
= CHAR_POS_ADDR (from
);
8729 pend
= CHAR_POS_ADDR (to
);
8730 if (from
< GPT
&& to
>= GPT
)
8734 charset_map_loaded
= 0;
8738 return (NILP (count
) ? Fcar (positions
) : Fnreverse (positions
));
8742 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
8743 Scheck_coding_systems_region
, 3, 3, 0,
8744 doc
: /* Check if the region is encodable by coding systems.
8746 START and END are buffer positions specifying the region.
8747 CODING-SYSTEM-LIST is a list of coding systems to check.
8749 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8750 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8751 whole region, POS0, POS1, ... are buffer positions where non-encodable
8752 characters are found.
8754 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8757 START may be a string. In that case, check if the string is
8758 encodable, and the value contains indices to the string instead of
8759 buffer positions. END is ignored.
8761 If the current buffer (or START if it is a string) is unibyte, the value
8763 (Lisp_Object start
, Lisp_Object end
, Lisp_Object coding_system_list
)
8766 ptrdiff_t start_byte
, end_byte
;
8768 const unsigned char *p
, *pbeg
, *pend
;
8770 Lisp_Object tail
, elt
, attrs
;
8772 if (STRINGP (start
))
8774 if (!STRING_MULTIBYTE (start
)
8775 || SCHARS (start
) == SBYTES (start
))
8778 end_byte
= SBYTES (start
);
8783 CHECK_NUMBER_COERCE_MARKER (start
);
8784 CHECK_NUMBER_COERCE_MARKER (end
);
8785 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
8786 args_out_of_range (start
, end
);
8787 if (NILP (BVAR (current_buffer
, enable_multibyte_characters
)))
8789 start_byte
= CHAR_TO_BYTE (XINT (start
));
8790 end_byte
= CHAR_TO_BYTE (XINT (end
));
8791 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
8794 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
8796 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
8797 move_gap_both (XINT (start
), start_byte
);
8799 move_gap_both (XINT (end
), end_byte
);
8805 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
8808 attrs
= AREF (CODING_SYSTEM_SPEC (elt
), 0);
8809 ASET (attrs
, coding_attr_trans_tbl
,
8810 get_translation_table (attrs
, 1, NULL
));
8811 list
= Fcons (Fcons (elt
, Fcons (attrs
, Qnil
)), list
);
8814 if (STRINGP (start
))
8815 p
= pbeg
= SDATA (start
);
8817 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
8818 pend
= p
+ (end_byte
- start_byte
);
8820 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
8821 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
8825 if (ASCII_BYTE_P (*p
))
8829 c
= STRING_CHAR_ADVANCE (p
);
8831 charset_map_loaded
= 0;
8832 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
8834 elt
= XCDR (XCAR (tail
));
8835 if (! char_encodable_p (c
, XCAR (elt
)))
8836 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
8838 if (charset_map_loaded
)
8840 ptrdiff_t p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
8842 if (STRINGP (start
))
8843 pbeg
= SDATA (start
);
8845 pbeg
= BYTE_POS_ADDR (start_byte
);
8846 p
= pbeg
+ p_offset
;
8847 pend
= pbeg
+ pend_offset
;
8855 for (; CONSP (tail
); tail
= XCDR (tail
))
8858 if (CONSP (XCDR (XCDR (elt
))))
8859 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
8868 code_convert_region (Lisp_Object start
, Lisp_Object end
,
8869 Lisp_Object coding_system
, Lisp_Object dst_object
,
8870 bool encodep
, bool norecord
)
8872 struct coding_system coding
;
8873 ptrdiff_t from
, from_byte
, to
, to_byte
;
8874 Lisp_Object src_object
;
8876 CHECK_NUMBER_COERCE_MARKER (start
);
8877 CHECK_NUMBER_COERCE_MARKER (end
);
8878 if (NILP (coding_system
))
8879 coding_system
= Qno_conversion
;
8881 CHECK_CODING_SYSTEM (coding_system
);
8882 src_object
= Fcurrent_buffer ();
8883 if (NILP (dst_object
))
8884 dst_object
= src_object
;
8885 else if (! EQ (dst_object
, Qt
))
8886 CHECK_BUFFER (dst_object
);
8888 validate_region (&start
, &end
);
8889 from
= XFASTINT (start
);
8890 from_byte
= CHAR_TO_BYTE (from
);
8891 to
= XFASTINT (end
);
8892 to_byte
= CHAR_TO_BYTE (to
);
8894 setup_coding_system (coding_system
, &coding
);
8895 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
8898 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
8901 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
8904 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
8906 return (BUFFERP (dst_object
)
8907 ? make_number (coding
.produced_char
)
8908 : coding
.dst_object
);
8912 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
8913 3, 4, "r\nzCoding system: ",
8914 doc
: /* Decode the current region from the specified coding system.
8915 When called from a program, takes four arguments:
8916 START, END, CODING-SYSTEM, and DESTINATION.
8917 START and END are buffer positions.
8919 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8920 If nil, the region between START and END is replaced by the decoded text.
8921 If buffer, the decoded text is inserted in that buffer after point (point
8923 In those cases, the length of the decoded text is returned.
8924 If DESTINATION is t, the decoded text is returned.
8926 This function sets `last-coding-system-used' to the precise coding system
8927 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8928 not fully specified.) */)
8929 (Lisp_Object start
, Lisp_Object end
, Lisp_Object coding_system
, Lisp_Object destination
)
8931 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
8934 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
8935 3, 4, "r\nzCoding system: ",
8936 doc
: /* Encode the current region by specified coding system.
8937 When called from a program, takes four arguments:
8938 START, END, CODING-SYSTEM and DESTINATION.
8939 START and END are buffer positions.
8941 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8942 If nil, the region between START and END is replace by the encoded text.
8943 If buffer, the encoded text is inserted in that buffer after point (point
8945 In those cases, the length of the encoded text is returned.
8946 If DESTINATION is t, the encoded text is returned.
8948 This function sets `last-coding-system-used' to the precise coding system
8949 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8950 not fully specified.) */)
8951 (Lisp_Object start
, Lisp_Object end
, Lisp_Object coding_system
, Lisp_Object destination
)
8953 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
8957 code_convert_string (Lisp_Object string
, Lisp_Object coding_system
,
8958 Lisp_Object dst_object
, bool encodep
, bool nocopy
,
8961 struct coding_system coding
;
8962 ptrdiff_t chars
, bytes
;
8964 CHECK_STRING (string
);
8965 if (NILP (coding_system
))
8968 Vlast_coding_system_used
= Qno_conversion
;
8969 if (NILP (dst_object
))
8970 return (nocopy
? Fcopy_sequence (string
) : string
);
8973 if (NILP (coding_system
))
8974 coding_system
= Qno_conversion
;
8976 CHECK_CODING_SYSTEM (coding_system
);
8977 if (NILP (dst_object
))
8979 else if (! EQ (dst_object
, Qt
))
8980 CHECK_BUFFER (dst_object
);
8982 setup_coding_system (coding_system
, &coding
);
8983 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
8984 chars
= SCHARS (string
);
8985 bytes
= SBYTES (string
);
8987 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
8989 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
8991 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
8993 return (BUFFERP (dst_object
)
8994 ? make_number (coding
.produced_char
)
8995 : coding
.dst_object
);
8999 /* Encode or decode STRING according to CODING_SYSTEM.
9000 Do not set Vlast_coding_system_used.
9002 This function is called only from macros DECODE_FILE and
9003 ENCODE_FILE, thus we ignore character composition. */
9006 code_convert_string_norecord (Lisp_Object string
, Lisp_Object coding_system
,
9009 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
9013 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
9015 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9017 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9018 if the decoding operation is trivial.
9020 Optional fourth arg BUFFER non-nil means that the decoded text is
9021 inserted in that buffer after point (point does not move). In this
9022 case, the return value is the length of the decoded text.
9024 This function sets `last-coding-system-used' to the precise coding system
9025 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9026 not fully specified.) */)
9027 (Lisp_Object string
, Lisp_Object coding_system
, Lisp_Object nocopy
, Lisp_Object buffer
)
9029 return code_convert_string (string
, coding_system
, buffer
,
9030 0, ! NILP (nocopy
), 0);
9033 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
9035 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
9037 Optional third arg NOCOPY non-nil means it is OK to return STRING
9038 itself if the encoding operation is trivial.
9040 Optional fourth arg BUFFER non-nil means that the encoded text is
9041 inserted in that buffer after point (point does not move). In this
9042 case, the return value is the length of the encoded text.
9044 This function sets `last-coding-system-used' to the precise coding system
9045 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9046 not fully specified.) */)
9047 (Lisp_Object string
, Lisp_Object coding_system
, Lisp_Object nocopy
, Lisp_Object buffer
)
9049 return code_convert_string (string
, coding_system
, buffer
,
9050 1, ! NILP (nocopy
), 0);
9054 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
9055 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
9056 Return the corresponding character. */)
9059 Lisp_Object spec
, attrs
, val
;
9060 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
9064 CHECK_NATNUM (code
);
9065 ch
= XFASTINT (code
);
9066 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
9067 attrs
= AREF (spec
, 0);
9069 if (ASCII_BYTE_P (ch
)
9070 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
9073 val
= CODING_ATTR_CHARSET_LIST (attrs
);
9074 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
9075 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
9076 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
9081 charset
= charset_roman
;
9083 else if (ch
>= 0xA0 && ch
< 0xDF)
9086 charset
= charset_kana
;
9090 EMACS_INT c1
= ch
>> 8;
9093 if (c1
< 0x81 || (c1
> 0x9F && c1
< 0xE0) || c1
> 0xEF
9094 || c2
< 0x40 || c2
== 0x7F || c2
> 0xFC)
9095 error ("Invalid code: %"pI
"d", ch
);
9098 charset
= charset_kanji
;
9100 c
= DECODE_CHAR (charset
, c
);
9102 error ("Invalid code: %"pI
"d", ch
);
9103 return make_number (c
);
9107 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
9108 doc
: /* Encode a Japanese character CH to shift_jis encoding.
9109 Return the corresponding code in SJIS. */)
9112 Lisp_Object spec
, attrs
, charset_list
;
9114 struct charset
*charset
;
9117 CHECK_CHARACTER (ch
);
9119 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
9120 attrs
= AREF (spec
, 0);
9122 if (ASCII_CHAR_P (c
)
9123 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
9126 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
9127 charset
= char_charset (c
, charset_list
, &code
);
9128 if (code
== CHARSET_INVALID_CODE (charset
))
9129 error ("Can't encode by shift_jis encoding: %c", c
);
9132 return make_number (code
);
9135 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
9136 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
9137 Return the corresponding character. */)
9140 Lisp_Object spec
, attrs
, val
;
9141 struct charset
*charset_roman
, *charset_big5
, *charset
;
9145 CHECK_NATNUM (code
);
9146 ch
= XFASTINT (code
);
9147 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
9148 attrs
= AREF (spec
, 0);
9150 if (ASCII_BYTE_P (ch
)
9151 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
9154 val
= CODING_ATTR_CHARSET_LIST (attrs
);
9155 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
9156 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
9161 charset
= charset_roman
;
9165 EMACS_INT b1
= ch
>> 8;
9167 if (b1
< 0xA1 || b1
> 0xFE
9168 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
9169 error ("Invalid code: %"pI
"d", ch
);
9171 charset
= charset_big5
;
9173 c
= DECODE_CHAR (charset
, c
);
9175 error ("Invalid code: %"pI
"d", ch
);
9176 return make_number (c
);
9179 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
9180 doc
: /* Encode the Big5 character CH to BIG5 coding system.
9181 Return the corresponding character code in Big5. */)
9184 Lisp_Object spec
, attrs
, charset_list
;
9185 struct charset
*charset
;
9189 CHECK_CHARACTER (ch
);
9191 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
9192 attrs
= AREF (spec
, 0);
9193 if (ASCII_CHAR_P (c
)
9194 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
9197 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
9198 charset
= char_charset (c
, charset_list
, &code
);
9199 if (code
== CHARSET_INVALID_CODE (charset
))
9200 error ("Can't encode by Big5 encoding: %c", c
);
9202 return make_number (code
);
9206 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal
,
9207 Sset_terminal_coding_system_internal
, 1, 2, 0,
9208 doc
: /* Internal use only. */)
9209 (Lisp_Object coding_system
, Lisp_Object terminal
)
9211 struct terminal
*term
= get_terminal (terminal
, 1);
9212 struct coding_system
*terminal_coding
= TERMINAL_TERMINAL_CODING (term
);
9213 CHECK_SYMBOL (coding_system
);
9214 setup_coding_system (Fcheck_coding_system (coding_system
), terminal_coding
);
9215 /* We had better not send unsafe characters to terminal. */
9216 terminal_coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
9217 /* Character composition should be disabled. */
9218 terminal_coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
9219 terminal_coding
->src_multibyte
= 1;
9220 terminal_coding
->dst_multibyte
= 0;
9222 (term
, (terminal_coding
->common_flags
& CODING_REQUIRE_ENCODING_MASK
9223 ? coding_charset_list (terminal_coding
)
9224 : Fcons (make_number (charset_ascii
), Qnil
)));
9228 DEFUN ("set-safe-terminal-coding-system-internal",
9229 Fset_safe_terminal_coding_system_internal
,
9230 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
9231 doc
: /* Internal use only. */)
9232 (Lisp_Object coding_system
)
9234 CHECK_SYMBOL (coding_system
);
9235 setup_coding_system (Fcheck_coding_system (coding_system
),
9236 &safe_terminal_coding
);
9237 /* Character composition should be disabled. */
9238 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
9239 safe_terminal_coding
.src_multibyte
= 1;
9240 safe_terminal_coding
.dst_multibyte
= 0;
9244 DEFUN ("terminal-coding-system", Fterminal_coding_system
,
9245 Sterminal_coding_system
, 0, 1, 0,
9246 doc
: /* Return coding system specified for terminal output on the given terminal.
9247 TERMINAL may be a terminal object, a frame, or nil for the selected
9248 frame's terminal device. */)
9249 (Lisp_Object terminal
)
9251 struct coding_system
*terminal_coding
9252 = TERMINAL_TERMINAL_CODING (get_terminal (terminal
, 1));
9253 Lisp_Object coding_system
= CODING_ID_NAME (terminal_coding
->id
);
9255 /* For backward compatibility, return nil if it is `undecided'. */
9256 return (! EQ (coding_system
, Qundecided
) ? coding_system
: Qnil
);
9259 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal
,
9260 Sset_keyboard_coding_system_internal
, 1, 2, 0,
9261 doc
: /* Internal use only. */)
9262 (Lisp_Object coding_system
, Lisp_Object terminal
)
9264 struct terminal
*t
= get_terminal (terminal
, 1);
9265 CHECK_SYMBOL (coding_system
);
9266 if (NILP (coding_system
))
9267 coding_system
= Qno_conversion
;
9269 Fcheck_coding_system (coding_system
);
9270 setup_coding_system (coding_system
, TERMINAL_KEYBOARD_CODING (t
));
9271 /* Character composition should be disabled. */
9272 TERMINAL_KEYBOARD_CODING (t
)->common_flags
9273 &= ~CODING_ANNOTATE_COMPOSITION_MASK
;
9277 DEFUN ("keyboard-coding-system",
9278 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 1, 0,
9279 doc
: /* Return coding system specified for decoding keyboard input. */)
9280 (Lisp_Object terminal
)
9282 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9283 (get_terminal (terminal
, 1))->id
);
9287 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
9288 Sfind_operation_coding_system
, 1, MANY
, 0,
9289 doc
: /* Choose a coding system for an operation based on the target name.
9290 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9291 DECODING-SYSTEM is the coding system to use for decoding
9292 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9293 for encoding (in case OPERATION does encoding).
9295 The first argument OPERATION specifies an I/O primitive:
9296 For file I/O, `insert-file-contents' or `write-region'.
9297 For process I/O, `call-process', `call-process-region', or `start-process'.
9298 For network I/O, `open-network-stream'.
9300 The remaining arguments should be the same arguments that were passed
9301 to the primitive. Depending on which primitive, one of those arguments
9302 is selected as the TARGET. For example, if OPERATION does file I/O,
9303 whichever argument specifies the file name is TARGET.
9305 TARGET has a meaning which depends on OPERATION:
9306 For file I/O, TARGET is a file name (except for the special case below).
9307 For process I/O, TARGET is a process name.
9308 For network I/O, TARGET is a service name or a port number.
9310 This function looks up what is specified for TARGET in
9311 `file-coding-system-alist', `process-coding-system-alist',
9312 or `network-coding-system-alist' depending on OPERATION.
9313 They may specify a coding system, a cons of coding systems,
9314 or a function symbol to call.
9315 In the last case, we call the function with one argument,
9316 which is a list of all the arguments given to this function.
9317 If the function can't decide a coding system, it can return
9318 `undecided' so that the normal code-detection is performed.
9320 If OPERATION is `insert-file-contents', the argument corresponding to
9321 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9322 file name to look up, and BUFFER is a buffer that contains the file's
9323 contents (not yet decoded). If `file-coding-system-alist' specifies a
9324 function to call for FILENAME, that function should examine the
9325 contents of BUFFER instead of reading the file.
9327 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9328 (ptrdiff_t nargs
, Lisp_Object
*args
)
9330 Lisp_Object operation
, target_idx
, target
, val
;
9331 register Lisp_Object chain
;
9334 error ("Too few arguments");
9335 operation
= args
[0];
9336 if (!SYMBOLP (operation
)
9337 || (target_idx
= Fget (operation
, Qtarget_idx
), !NATNUMP (target_idx
)))
9338 error ("Invalid first argument");
9339 if (nargs
<= 1 + XFASTINT (target_idx
))
9340 error ("Too few arguments for operation `%s'",
9341 SDATA (SYMBOL_NAME (operation
)));
9342 target
= args
[XFASTINT (target_idx
) + 1];
9343 if (!(STRINGP (target
)
9344 || (EQ (operation
, Qinsert_file_contents
) && CONSP (target
)
9345 && STRINGP (XCAR (target
)) && BUFFERP (XCDR (target
)))
9346 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
9347 error ("Invalid argument %"pI
"d of operation `%s'",
9348 XFASTINT (target_idx
) + 1, SDATA (SYMBOL_NAME (operation
)));
9350 target
= XCAR (target
);
9352 chain
= ((EQ (operation
, Qinsert_file_contents
)
9353 || EQ (operation
, Qwrite_region
))
9354 ? Vfile_coding_system_alist
9355 : (EQ (operation
, Qopen_network_stream
)
9356 ? Vnetwork_coding_system_alist
9357 : Vprocess_coding_system_alist
));
9361 for (; CONSP (chain
); chain
= XCDR (chain
))
9367 && ((STRINGP (target
)
9368 && STRINGP (XCAR (elt
))
9369 && fast_string_match (XCAR (elt
), target
) >= 0)
9370 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
9373 /* Here, if VAL is both a valid coding system and a valid
9374 function symbol, we return VAL as a coding system. */
9377 if (! SYMBOLP (val
))
9379 if (! NILP (Fcoding_system_p (val
)))
9380 return Fcons (val
, val
);
9381 if (! NILP (Ffboundp (val
)))
9383 /* We use call1 rather than safe_call1
9384 so as to get bug reports about functions called here
9385 which don't handle the current interface. */
9386 val
= call1 (val
, Flist (nargs
, args
));
9389 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
9390 return Fcons (val
, val
);
9398 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
9399 Sset_coding_system_priority
, 0, MANY
, 0,
9400 doc
: /* Assign higher priority to the coding systems given as arguments.
9401 If multiple coding systems belong to the same category,
9402 all but the first one are ignored.
9404 usage: (set-coding-system-priority &rest coding-systems) */)
9405 (ptrdiff_t nargs
, Lisp_Object
*args
)
9408 bool changed
[coding_category_max
];
9409 enum coding_category priorities
[coding_category_max
];
9411 memset (changed
, 0, sizeof changed
);
9413 for (i
= j
= 0; i
< nargs
; i
++)
9415 enum coding_category category
;
9416 Lisp_Object spec
, attrs
;
9418 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
9419 attrs
= AREF (spec
, 0);
9420 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
9421 if (changed
[category
])
9422 /* Ignore this coding system because a coding system of the
9423 same category already had a higher priority. */
9425 changed
[category
] = 1;
9426 priorities
[j
++] = category
;
9427 if (coding_categories
[category
].id
>= 0
9428 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
9429 setup_coding_system (args
[i
], &coding_categories
[category
]);
9430 Fset (AREF (Vcoding_category_table
, category
), args
[i
]);
9433 /* Now we have decided top J priorities. Reflect the order of the
9434 original priorities to the remaining priorities. */
9436 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
9438 while (j
< coding_category_max
9439 && changed
[coding_priorities
[j
]])
9441 if (j
== coding_category_max
)
9443 priorities
[i
] = coding_priorities
[j
];
9446 memcpy (coding_priorities
, priorities
, sizeof priorities
);
9448 /* Update `coding-category-list'. */
9449 Vcoding_category_list
= Qnil
;
9450 for (i
= coding_category_max
; i
-- > 0; )
9451 Vcoding_category_list
9452 = Fcons (AREF (Vcoding_category_table
, priorities
[i
]),
9453 Vcoding_category_list
);
9458 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
9459 Scoding_system_priority_list
, 0, 1, 0,
9460 doc
: /* Return a list of coding systems ordered by their priorities.
9461 The list contains a subset of coding systems; i.e. coding systems
9462 assigned to each coding category (see `coding-category-list').
9464 HIGHESTP non-nil means just return the highest priority one. */)
9465 (Lisp_Object highestp
)
9470 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
9472 enum coding_category category
= coding_priorities
[i
];
9473 int id
= coding_categories
[category
].id
;
9478 attrs
= CODING_ID_ATTRS (id
);
9479 if (! NILP (highestp
))
9480 return CODING_ATTR_BASE_NAME (attrs
);
9481 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
9483 return Fnreverse (val
);
9486 static const char *const suffixes
[] = { "-unix", "-dos", "-mac" };
9489 make_subsidiaries (Lisp_Object base
)
9491 Lisp_Object subsidiaries
;
9492 ptrdiff_t base_name_len
= SBYTES (SYMBOL_NAME (base
));
9493 char *buf
= alloca (base_name_len
+ 6);
9496 memcpy (buf
, SDATA (SYMBOL_NAME (base
)), base_name_len
);
9497 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
9498 for (i
= 0; i
< 3; i
++)
9500 strcpy (buf
+ base_name_len
, suffixes
[i
]);
9501 ASET (subsidiaries
, i
, intern (buf
));
9503 return subsidiaries
;
9507 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
9508 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
9509 doc
: /* For internal use only.
9510 usage: (define-coding-system-internal ...) */)
9511 (ptrdiff_t nargs
, Lisp_Object
*args
)
9514 Lisp_Object spec_vec
; /* [ ATTRS ALIASE EOL_TYPE ] */
9515 Lisp_Object attrs
; /* Vector of attributes. */
9516 Lisp_Object eol_type
;
9517 Lisp_Object aliases
;
9518 Lisp_Object coding_type
, charset_list
, safe_charsets
;
9519 enum coding_category category
;
9520 Lisp_Object tail
, val
;
9521 int max_charset_id
= 0;
9524 if (nargs
< coding_arg_max
)
9527 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
9529 name
= args
[coding_arg_name
];
9530 CHECK_SYMBOL (name
);
9531 ASET (attrs
, coding_attr_base_name
, name
);
9533 val
= args
[coding_arg_mnemonic
];
9534 if (! STRINGP (val
))
9535 CHECK_CHARACTER (val
);
9536 ASET (attrs
, coding_attr_mnemonic
, val
);
9538 coding_type
= args
[coding_arg_coding_type
];
9539 CHECK_SYMBOL (coding_type
);
9540 ASET (attrs
, coding_attr_type
, coding_type
);
9542 charset_list
= args
[coding_arg_charset_list
];
9543 if (SYMBOLP (charset_list
))
9545 if (EQ (charset_list
, Qiso_2022
))
9547 if (! EQ (coding_type
, Qiso_2022
))
9548 error ("Invalid charset-list");
9549 charset_list
= Viso_2022_charset_list
;
9551 else if (EQ (charset_list
, Qemacs_mule
))
9553 if (! EQ (coding_type
, Qemacs_mule
))
9554 error ("Invalid charset-list");
9555 charset_list
= Vemacs_mule_charset_list
;
9557 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
9559 if (! RANGED_INTEGERP (0, XCAR (tail
), INT_MAX
- 1))
9560 error ("Invalid charset-list");
9561 if (max_charset_id
< XFASTINT (XCAR (tail
)))
9562 max_charset_id
= XFASTINT (XCAR (tail
));
9567 charset_list
= Fcopy_sequence (charset_list
);
9568 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
9570 struct charset
*charset
;
9573 CHECK_CHARSET_GET_CHARSET (val
, charset
);
9574 if (EQ (coding_type
, Qiso_2022
)
9575 ? CHARSET_ISO_FINAL (charset
) < 0
9576 : EQ (coding_type
, Qemacs_mule
)
9577 ? CHARSET_EMACS_MULE_ID (charset
) < 0
9579 error ("Can't handle charset `%s'",
9580 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9582 XSETCAR (tail
, make_number (charset
->id
));
9583 if (max_charset_id
< charset
->id
)
9584 max_charset_id
= charset
->id
;
9587 ASET (attrs
, coding_attr_charset_list
, charset_list
);
9589 safe_charsets
= make_uninit_string (max_charset_id
+ 1);
9590 memset (SDATA (safe_charsets
), 255, max_charset_id
+ 1);
9591 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
9592 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
9593 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
9595 ASET (attrs
, coding_attr_ascii_compat
, args
[coding_arg_ascii_compatible_p
]);
9597 val
= args
[coding_arg_decode_translation_table
];
9598 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
9600 ASET (attrs
, coding_attr_decode_tbl
, val
);
9602 val
= args
[coding_arg_encode_translation_table
];
9603 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
9605 ASET (attrs
, coding_attr_encode_tbl
, val
);
9607 val
= args
[coding_arg_post_read_conversion
];
9609 ASET (attrs
, coding_attr_post_read
, val
);
9611 val
= args
[coding_arg_pre_write_conversion
];
9613 ASET (attrs
, coding_attr_pre_write
, val
);
9615 val
= args
[coding_arg_default_char
];
9617 ASET (attrs
, coding_attr_default_char
, make_number (' '));
9620 CHECK_CHARACTER (val
);
9621 ASET (attrs
, coding_attr_default_char
, val
);
9624 val
= args
[coding_arg_for_unibyte
];
9625 ASET (attrs
, coding_attr_for_unibyte
, NILP (val
) ? Qnil
: Qt
);
9627 val
= args
[coding_arg_plist
];
9629 ASET (attrs
, coding_attr_plist
, val
);
9631 if (EQ (coding_type
, Qcharset
))
9633 /* Generate a lisp vector of 256 elements. Each element is nil,
9634 integer, or a list of charset IDs.
9636 If Nth element is nil, the byte code N is invalid in this
9639 If Nth element is a number NUM, N is the first byte of a
9640 charset whose ID is NUM.
9642 If Nth element is a list of charset IDs, N is the first byte
9643 of one of them. The list is sorted by dimensions of the
9644 charsets. A charset of smaller dimension comes first. */
9645 val
= Fmake_vector (make_number (256), Qnil
);
9647 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
9649 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
9650 int dim
= CHARSET_DIMENSION (charset
);
9651 int idx
= (dim
- 1) * 4;
9653 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
9654 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
9656 for (i
= charset
->code_space
[idx
];
9657 i
<= charset
->code_space
[idx
+ 1]; i
++)
9659 Lisp_Object tmp
, tmp2
;
9662 tmp
= AREF (val
, i
);
9665 else if (NUMBERP (tmp
))
9667 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
9669 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
9671 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
9675 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
9677 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
9682 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
9685 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
9686 XSETCAR (tmp2
, XCAR (tail
));
9692 ASET (attrs
, coding_attr_charset_valids
, val
);
9693 category
= coding_category_charset
;
9695 else if (EQ (coding_type
, Qccl
))
9699 if (nargs
< coding_arg_ccl_max
)
9702 val
= args
[coding_arg_ccl_decoder
];
9703 CHECK_CCL_PROGRAM (val
);
9705 val
= Fcopy_sequence (val
);
9706 ASET (attrs
, coding_attr_ccl_decoder
, val
);
9708 val
= args
[coding_arg_ccl_encoder
];
9709 CHECK_CCL_PROGRAM (val
);
9711 val
= Fcopy_sequence (val
);
9712 ASET (attrs
, coding_attr_ccl_encoder
, val
);
9714 val
= args
[coding_arg_ccl_valids
];
9715 valids
= Fmake_string (make_number (256), make_number (0));
9716 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
9723 if (! (0 <= XINT (val
) && XINT (val
) <= 255))
9724 args_out_of_range_3 (val
, make_number (0), make_number (255));
9725 from
= to
= XINT (val
);
9730 CHECK_NATNUM_CAR (val
);
9731 CHECK_NUMBER_CDR (val
);
9732 if (XINT (XCAR (val
)) > 255)
9733 args_out_of_range_3 (XCAR (val
),
9734 make_number (0), make_number (255));
9735 from
= XINT (XCAR (val
));
9736 if (! (from
<= XINT (XCDR (val
)) && XINT (XCDR (val
)) <= 255))
9737 args_out_of_range_3 (XCDR (val
),
9738 XCAR (val
), make_number (255));
9739 to
= XINT (XCDR (val
));
9741 for (i
= from
; i
<= to
; i
++)
9742 SSET (valids
, i
, 1);
9744 ASET (attrs
, coding_attr_ccl_valids
, valids
);
9746 category
= coding_category_ccl
;
9748 else if (EQ (coding_type
, Qutf_16
))
9750 Lisp_Object bom
, endian
;
9752 ASET (attrs
, coding_attr_ascii_compat
, Qnil
);
9754 if (nargs
< coding_arg_utf16_max
)
9757 bom
= args
[coding_arg_utf16_bom
];
9758 if (! NILP (bom
) && ! EQ (bom
, Qt
))
9762 CHECK_CODING_SYSTEM (val
);
9764 CHECK_CODING_SYSTEM (val
);
9766 ASET (attrs
, coding_attr_utf_bom
, bom
);
9768 endian
= args
[coding_arg_utf16_endian
];
9769 CHECK_SYMBOL (endian
);
9772 else if (! EQ (endian
, Qbig
) && ! EQ (endian
, Qlittle
))
9773 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian
)));
9774 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
9776 category
= (CONSP (bom
)
9777 ? coding_category_utf_16_auto
9779 ? (EQ (endian
, Qbig
)
9780 ? coding_category_utf_16_be_nosig
9781 : coding_category_utf_16_le_nosig
)
9782 : (EQ (endian
, Qbig
)
9783 ? coding_category_utf_16_be
9784 : coding_category_utf_16_le
));
9786 else if (EQ (coding_type
, Qiso_2022
))
9788 Lisp_Object initial
, reg_usage
, request
, flags
;
9790 if (nargs
< coding_arg_iso2022_max
)
9793 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
9794 CHECK_VECTOR (initial
);
9795 for (i
= 0; i
< 4; i
++)
9797 val
= Faref (initial
, make_number (i
));
9800 struct charset
*charset
;
9802 CHECK_CHARSET_GET_CHARSET (val
, charset
);
9803 ASET (initial
, i
, make_number (CHARSET_ID (charset
)));
9804 if (i
== 0 && CHARSET_ASCII_COMPATIBLE_P (charset
))
9805 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
9808 ASET (initial
, i
, make_number (-1));
9811 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
9812 CHECK_CONS (reg_usage
);
9813 CHECK_NUMBER_CAR (reg_usage
);
9814 CHECK_NUMBER_CDR (reg_usage
);
9816 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
9817 for (tail
= request
; CONSP (tail
); tail
= XCDR (tail
))
9825 CHECK_CHARSET_GET_ID (tmp1
, id
);
9826 CHECK_NATNUM_CDR (val
);
9827 if (XINT (XCDR (val
)) >= 4)
9828 error ("Invalid graphic register number: %"pI
"d", XINT (XCDR (val
)));
9829 XSETCAR (val
, make_number (id
));
9832 flags
= args
[coding_arg_iso2022_flags
];
9833 CHECK_NATNUM (flags
);
9834 i
= XINT (flags
) & INT_MAX
;
9835 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
9836 i
|= CODING_ISO_FLAG_FULL_SUPPORT
;
9837 flags
= make_number (i
);
9839 ASET (attrs
, coding_attr_iso_initial
, initial
);
9840 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
9841 ASET (attrs
, coding_attr_iso_request
, request
);
9842 ASET (attrs
, coding_attr_iso_flags
, flags
);
9843 setup_iso_safe_charsets (attrs
);
9845 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
9846 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
9847 | CODING_ISO_FLAG_SINGLE_SHIFT
))
9848 ? coding_category_iso_7_else
9849 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
9850 ? coding_category_iso_7
9851 : coding_category_iso_7_tight
);
9854 int id
= XINT (AREF (initial
, 1));
9856 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
9857 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
9859 ? coding_category_iso_8_else
9860 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
9861 ? coding_category_iso_8_1
9862 : coding_category_iso_8_2
);
9864 if (category
!= coding_category_iso_8_1
9865 && category
!= coding_category_iso_8_2
)
9866 ASET (attrs
, coding_attr_ascii_compat
, Qnil
);
9868 else if (EQ (coding_type
, Qemacs_mule
))
9870 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
9871 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
9872 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
9873 category
= coding_category_emacs_mule
;
9875 else if (EQ (coding_type
, Qshift_jis
))
9878 struct charset
*charset
;
9880 if (XINT (Flength (charset_list
)) != 3
9881 && XINT (Flength (charset_list
)) != 4)
9882 error ("There should be three or four charsets");
9884 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
9885 if (CHARSET_DIMENSION (charset
) != 1)
9886 error ("Dimension of charset %s is not one",
9887 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9888 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
9889 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
9891 charset_list
= XCDR (charset_list
);
9892 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
9893 if (CHARSET_DIMENSION (charset
) != 1)
9894 error ("Dimension of charset %s is not one",
9895 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9897 charset_list
= XCDR (charset_list
);
9898 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
9899 if (CHARSET_DIMENSION (charset
) != 2)
9900 error ("Dimension of charset %s is not two",
9901 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9903 charset_list
= XCDR (charset_list
);
9904 if (! NILP (charset_list
))
9906 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
9907 if (CHARSET_DIMENSION (charset
) != 2)
9908 error ("Dimension of charset %s is not two",
9909 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9912 category
= coding_category_sjis
;
9913 Vsjis_coding_system
= name
;
9915 else if (EQ (coding_type
, Qbig5
))
9917 struct charset
*charset
;
9919 if (XINT (Flength (charset_list
)) != 2)
9920 error ("There should be just two charsets");
9922 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
9923 if (CHARSET_DIMENSION (charset
) != 1)
9924 error ("Dimension of charset %s is not one",
9925 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9926 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
9927 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
9929 charset_list
= XCDR (charset_list
);
9930 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
9931 if (CHARSET_DIMENSION (charset
) != 2)
9932 error ("Dimension of charset %s is not two",
9933 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9935 category
= coding_category_big5
;
9936 Vbig5_coding_system
= name
;
9938 else if (EQ (coding_type
, Qraw_text
))
9940 category
= coding_category_raw_text
;
9941 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
9943 else if (EQ (coding_type
, Qutf_8
))
9947 if (nargs
< coding_arg_utf8_max
)
9950 bom
= args
[coding_arg_utf8_bom
];
9951 if (! NILP (bom
) && ! EQ (bom
, Qt
))
9955 CHECK_CODING_SYSTEM (val
);
9957 CHECK_CODING_SYSTEM (val
);
9959 ASET (attrs
, coding_attr_utf_bom
, bom
);
9961 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
9963 category
= (CONSP (bom
) ? coding_category_utf_8_auto
9964 : NILP (bom
) ? coding_category_utf_8_nosig
9965 : coding_category_utf_8_sig
);
9967 else if (EQ (coding_type
, Qundecided
))
9968 category
= coding_category_undecided
;
9970 error ("Invalid coding system type: %s",
9971 SDATA (SYMBOL_NAME (coding_type
)));
9973 ASET (attrs
, coding_attr_category
, make_number (category
));
9974 ASET (attrs
, coding_attr_plist
,
9976 Fcons (AREF (Vcoding_category_table
, category
),
9977 CODING_ATTR_PLIST (attrs
))));
9978 ASET (attrs
, coding_attr_plist
,
9979 Fcons (QCascii_compatible_p
,
9980 Fcons (CODING_ATTR_ASCII_COMPAT (attrs
),
9981 CODING_ATTR_PLIST (attrs
))));
9983 eol_type
= args
[coding_arg_eol_type
];
9984 if (! NILP (eol_type
)
9985 && ! EQ (eol_type
, Qunix
)
9986 && ! EQ (eol_type
, Qdos
)
9987 && ! EQ (eol_type
, Qmac
))
9988 error ("Invalid eol-type");
9990 aliases
= Fcons (name
, Qnil
);
9992 if (NILP (eol_type
))
9994 eol_type
= make_subsidiaries (name
);
9995 for (i
= 0; i
< 3; i
++)
9997 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
9999 this_name
= AREF (eol_type
, i
);
10000 this_aliases
= Fcons (this_name
, Qnil
);
10001 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
10002 this_spec
= Fmake_vector (make_number (3), attrs
);
10003 ASET (this_spec
, 1, this_aliases
);
10004 ASET (this_spec
, 2, this_eol_type
);
10005 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
10006 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
10007 val
= Fassoc (Fsymbol_name (this_name
), Vcoding_system_alist
);
10009 Vcoding_system_alist
10010 = Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
10011 Vcoding_system_alist
);
10015 spec_vec
= Fmake_vector (make_number (3), attrs
);
10016 ASET (spec_vec
, 1, aliases
);
10017 ASET (spec_vec
, 2, eol_type
);
10019 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
10020 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
10021 val
= Fassoc (Fsymbol_name (name
), Vcoding_system_alist
);
10023 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
10024 Vcoding_system_alist
);
10027 int id
= coding_categories
[category
].id
;
10029 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
10030 setup_coding_system (name
, &coding_categories
[category
]);
10036 return Fsignal (Qwrong_number_of_arguments
,
10037 Fcons (intern ("define-coding-system-internal"),
10038 make_number (nargs
)));
10042 DEFUN ("coding-system-put", Fcoding_system_put
, Scoding_system_put
,
10044 doc
: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10045 (Lisp_Object coding_system
, Lisp_Object prop
, Lisp_Object val
)
10047 Lisp_Object spec
, attrs
;
10049 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10050 attrs
= AREF (spec
, 0);
10051 if (EQ (prop
, QCmnemonic
))
10053 if (! STRINGP (val
))
10054 CHECK_CHARACTER (val
);
10055 ASET (attrs
, coding_attr_mnemonic
, val
);
10057 else if (EQ (prop
, QCdefault_char
))
10060 val
= make_number (' ');
10062 CHECK_CHARACTER (val
);
10063 ASET (attrs
, coding_attr_default_char
, val
);
10065 else if (EQ (prop
, QCdecode_translation_table
))
10067 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
10068 CHECK_SYMBOL (val
);
10069 ASET (attrs
, coding_attr_decode_tbl
, val
);
10071 else if (EQ (prop
, QCencode_translation_table
))
10073 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
10074 CHECK_SYMBOL (val
);
10075 ASET (attrs
, coding_attr_encode_tbl
, val
);
10077 else if (EQ (prop
, QCpost_read_conversion
))
10079 CHECK_SYMBOL (val
);
10080 ASET (attrs
, coding_attr_post_read
, val
);
10082 else if (EQ (prop
, QCpre_write_conversion
))
10084 CHECK_SYMBOL (val
);
10085 ASET (attrs
, coding_attr_pre_write
, val
);
10087 else if (EQ (prop
, QCascii_compatible_p
))
10089 ASET (attrs
, coding_attr_ascii_compat
, val
);
10092 ASET (attrs
, coding_attr_plist
,
10093 Fplist_put (CODING_ATTR_PLIST (attrs
), prop
, val
));
10098 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
10099 Sdefine_coding_system_alias
, 2, 2, 0,
10100 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10101 (Lisp_Object alias
, Lisp_Object coding_system
)
10103 Lisp_Object spec
, aliases
, eol_type
, val
;
10105 CHECK_SYMBOL (alias
);
10106 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10107 aliases
= AREF (spec
, 1);
10108 /* ALIASES should be a list of length more than zero, and the first
10109 element is a base coding system. Append ALIAS at the tail of the
10111 while (!NILP (XCDR (aliases
)))
10112 aliases
= XCDR (aliases
);
10113 XSETCDR (aliases
, Fcons (alias
, Qnil
));
10115 eol_type
= AREF (spec
, 2);
10116 if (VECTORP (eol_type
))
10118 Lisp_Object subsidiaries
;
10121 subsidiaries
= make_subsidiaries (alias
);
10122 for (i
= 0; i
< 3; i
++)
10123 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
10124 AREF (eol_type
, i
));
10127 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
10128 Vcoding_system_list
= Fcons (alias
, Vcoding_system_list
);
10129 val
= Fassoc (Fsymbol_name (alias
), Vcoding_system_alist
);
10131 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
10132 Vcoding_system_alist
);
10137 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
10139 doc
: /* Return the base of CODING-SYSTEM.
10140 Any alias or subsidiary coding system is not a base coding system. */)
10141 (Lisp_Object coding_system
)
10143 Lisp_Object spec
, attrs
;
10145 if (NILP (coding_system
))
10146 return (Qno_conversion
);
10147 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10148 attrs
= AREF (spec
, 0);
10149 return CODING_ATTR_BASE_NAME (attrs
);
10152 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
10154 doc
: "Return the property list of CODING-SYSTEM.")
10155 (Lisp_Object coding_system
)
10157 Lisp_Object spec
, attrs
;
10159 if (NILP (coding_system
))
10160 coding_system
= Qno_conversion
;
10161 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10162 attrs
= AREF (spec
, 0);
10163 return CODING_ATTR_PLIST (attrs
);
10167 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
10169 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
10170 (Lisp_Object coding_system
)
10174 if (NILP (coding_system
))
10175 coding_system
= Qno_conversion
;
10176 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10177 return AREF (spec
, 1);
10180 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
10181 Scoding_system_eol_type
, 1, 1, 0,
10182 doc
: /* Return eol-type of CODING-SYSTEM.
10183 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10185 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10186 and CR respectively.
10188 A vector value indicates that a format of end-of-line should be
10189 detected automatically. Nth element of the vector is the subsidiary
10190 coding system whose eol-type is N. */)
10191 (Lisp_Object coding_system
)
10193 Lisp_Object spec
, eol_type
;
10196 if (NILP (coding_system
))
10197 coding_system
= Qno_conversion
;
10198 if (! CODING_SYSTEM_P (coding_system
))
10200 spec
= CODING_SYSTEM_SPEC (coding_system
);
10201 eol_type
= AREF (spec
, 2);
10202 if (VECTORP (eol_type
))
10203 return Fcopy_sequence (eol_type
);
10204 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
10205 return make_number (n
);
10211 /*** 9. Post-amble ***/
10214 init_coding_once (void)
10218 for (i
= 0; i
< coding_category_max
; i
++)
10220 coding_categories
[i
].id
= -1;
10221 coding_priorities
[i
] = i
;
10224 /* ISO2022 specific initialize routine. */
10225 for (i
= 0; i
< 0x20; i
++)
10226 iso_code_class
[i
] = ISO_control_0
;
10227 for (i
= 0x21; i
< 0x7F; i
++)
10228 iso_code_class
[i
] = ISO_graphic_plane_0
;
10229 for (i
= 0x80; i
< 0xA0; i
++)
10230 iso_code_class
[i
] = ISO_control_1
;
10231 for (i
= 0xA1; i
< 0xFF; i
++)
10232 iso_code_class
[i
] = ISO_graphic_plane_1
;
10233 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
10234 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
10235 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
10236 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
10237 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
10238 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
10239 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
10240 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
10241 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
10243 for (i
= 0; i
< 256; i
++)
10245 emacs_mule_bytes
[i
] = 1;
10247 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
10248 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
10249 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
10250 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
10256 syms_of_coding (void)
10258 staticpro (&Vcoding_system_hash_table
);
10260 Lisp_Object args
[2];
10263 Vcoding_system_hash_table
= Fmake_hash_table (2, args
);
10266 staticpro (&Vsjis_coding_system
);
10267 Vsjis_coding_system
= Qnil
;
10269 staticpro (&Vbig5_coding_system
);
10270 Vbig5_coding_system
= Qnil
;
10272 staticpro (&Vcode_conversion_reused_workbuf
);
10273 Vcode_conversion_reused_workbuf
= Qnil
;
10275 staticpro (&Vcode_conversion_workbuf_name
);
10276 Vcode_conversion_workbuf_name
= build_pure_c_string (" *code-conversion-work*");
10278 reused_workbuf_in_use
= 0;
10280 DEFSYM (Qcharset
, "charset");
10281 DEFSYM (Qtarget_idx
, "target-idx");
10282 DEFSYM (Qcoding_system_history
, "coding-system-history");
10283 Fset (Qcoding_system_history
, Qnil
);
10285 /* Target FILENAME is the first argument. */
10286 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
10287 /* Target FILENAME is the third argument. */
10288 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
10290 DEFSYM (Qcall_process
, "call-process");
10291 /* Target PROGRAM is the first argument. */
10292 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
10294 DEFSYM (Qcall_process_region
, "call-process-region");
10295 /* Target PROGRAM is the third argument. */
10296 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
10298 DEFSYM (Qstart_process
, "start-process");
10299 /* Target PROGRAM is the third argument. */
10300 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
10302 DEFSYM (Qopen_network_stream
, "open-network-stream");
10303 /* Target SERVICE is the fourth argument. */
10304 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
10306 DEFSYM (Qcoding_system
, "coding-system");
10307 DEFSYM (Qcoding_aliases
, "coding-aliases");
10309 DEFSYM (Qeol_type
, "eol-type");
10310 DEFSYM (Qunix
, "unix");
10311 DEFSYM (Qdos
, "dos");
10313 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
10314 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
10315 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
10316 DEFSYM (Qdefault_char
, "default-char");
10317 DEFSYM (Qundecided
, "undecided");
10318 DEFSYM (Qno_conversion
, "no-conversion");
10319 DEFSYM (Qraw_text
, "raw-text");
10321 DEFSYM (Qiso_2022
, "iso-2022");
10323 DEFSYM (Qutf_8
, "utf-8");
10324 DEFSYM (Qutf_8_emacs
, "utf-8-emacs");
10326 #if defined (WINDOWSNT) || defined (CYGWIN)
10327 /* No, not utf-16-le: that one has a BOM. */
10328 DEFSYM (Qutf_16le
, "utf-16le");
10331 DEFSYM (Qutf_16
, "utf-16");
10332 DEFSYM (Qbig
, "big");
10333 DEFSYM (Qlittle
, "little");
10335 DEFSYM (Qshift_jis
, "shift-jis");
10336 DEFSYM (Qbig5
, "big5");
10338 DEFSYM (Qcoding_system_p
, "coding-system-p");
10340 DEFSYM (Qcoding_system_error
, "coding-system-error");
10341 Fput (Qcoding_system_error
, Qerror_conditions
,
10342 listn (CONSTYPE_PURE
, 2, Qcoding_system_error
, Qerror
));
10343 Fput (Qcoding_system_error
, Qerror_message
,
10344 build_pure_c_string ("Invalid coding system"));
10346 /* Intern this now in case it isn't already done.
10347 Setting this variable twice is harmless.
10348 But don't staticpro it here--that is done in alloc.c. */
10349 Qchar_table_extra_slots
= intern_c_string ("char-table-extra-slots");
10351 DEFSYM (Qtranslation_table
, "translation-table");
10352 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (2));
10353 DEFSYM (Qtranslation_table_id
, "translation-table-id");
10354 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
10355 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
10357 DEFSYM (Qvalid_codes
, "valid-codes");
10359 DEFSYM (Qemacs_mule
, "emacs-mule");
10361 DEFSYM (QCcategory
, ":category");
10362 DEFSYM (QCmnemonic
, ":mnemonic");
10363 DEFSYM (QCdefault_char
, ":default-char");
10364 DEFSYM (QCdecode_translation_table
, ":decode-translation-table");
10365 DEFSYM (QCencode_translation_table
, ":encode-translation-table");
10366 DEFSYM (QCpost_read_conversion
, ":post-read-conversion");
10367 DEFSYM (QCpre_write_conversion
, ":pre-write-conversion");
10368 DEFSYM (QCascii_compatible_p
, ":ascii-compatible-p");
10370 Vcoding_category_table
10371 = Fmake_vector (make_number (coding_category_max
), Qnil
);
10372 staticpro (&Vcoding_category_table
);
10373 /* Followings are target of code detection. */
10374 ASET (Vcoding_category_table
, coding_category_iso_7
,
10375 intern_c_string ("coding-category-iso-7"));
10376 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
10377 intern_c_string ("coding-category-iso-7-tight"));
10378 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
10379 intern_c_string ("coding-category-iso-8-1"));
10380 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
10381 intern_c_string ("coding-category-iso-8-2"));
10382 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
10383 intern_c_string ("coding-category-iso-7-else"));
10384 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
10385 intern_c_string ("coding-category-iso-8-else"));
10386 ASET (Vcoding_category_table
, coding_category_utf_8_auto
,
10387 intern_c_string ("coding-category-utf-8-auto"));
10388 ASET (Vcoding_category_table
, coding_category_utf_8_nosig
,
10389 intern_c_string ("coding-category-utf-8"));
10390 ASET (Vcoding_category_table
, coding_category_utf_8_sig
,
10391 intern_c_string ("coding-category-utf-8-sig"));
10392 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
10393 intern_c_string ("coding-category-utf-16-be"));
10394 ASET (Vcoding_category_table
, coding_category_utf_16_auto
,
10395 intern_c_string ("coding-category-utf-16-auto"));
10396 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
10397 intern_c_string ("coding-category-utf-16-le"));
10398 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
10399 intern_c_string ("coding-category-utf-16-be-nosig"));
10400 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
10401 intern_c_string ("coding-category-utf-16-le-nosig"));
10402 ASET (Vcoding_category_table
, coding_category_charset
,
10403 intern_c_string ("coding-category-charset"));
10404 ASET (Vcoding_category_table
, coding_category_sjis
,
10405 intern_c_string ("coding-category-sjis"));
10406 ASET (Vcoding_category_table
, coding_category_big5
,
10407 intern_c_string ("coding-category-big5"));
10408 ASET (Vcoding_category_table
, coding_category_ccl
,
10409 intern_c_string ("coding-category-ccl"));
10410 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
10411 intern_c_string ("coding-category-emacs-mule"));
10412 /* Followings are NOT target of code detection. */
10413 ASET (Vcoding_category_table
, coding_category_raw_text
,
10414 intern_c_string ("coding-category-raw-text"));
10415 ASET (Vcoding_category_table
, coding_category_undecided
,
10416 intern_c_string ("coding-category-undecided"));
10418 DEFSYM (Qinsufficient_source
, "insufficient-source");
10419 DEFSYM (Qinconsistent_eol
, "inconsistent-eol");
10420 DEFSYM (Qinvalid_source
, "invalid-source");
10421 DEFSYM (Qinterrupted
, "interrupted");
10422 DEFSYM (Qinsufficient_memory
, "insufficient-memory");
10423 DEFSYM (Qcoding_system_define_form
, "coding-system-define-form");
10425 defsubr (&Scoding_system_p
);
10426 defsubr (&Sread_coding_system
);
10427 defsubr (&Sread_non_nil_coding_system
);
10428 defsubr (&Scheck_coding_system
);
10429 defsubr (&Sdetect_coding_region
);
10430 defsubr (&Sdetect_coding_string
);
10431 defsubr (&Sfind_coding_systems_region_internal
);
10432 defsubr (&Sunencodable_char_position
);
10433 defsubr (&Scheck_coding_systems_region
);
10434 defsubr (&Sdecode_coding_region
);
10435 defsubr (&Sencode_coding_region
);
10436 defsubr (&Sdecode_coding_string
);
10437 defsubr (&Sencode_coding_string
);
10438 defsubr (&Sdecode_sjis_char
);
10439 defsubr (&Sencode_sjis_char
);
10440 defsubr (&Sdecode_big5_char
);
10441 defsubr (&Sencode_big5_char
);
10442 defsubr (&Sset_terminal_coding_system_internal
);
10443 defsubr (&Sset_safe_terminal_coding_system_internal
);
10444 defsubr (&Sterminal_coding_system
);
10445 defsubr (&Sset_keyboard_coding_system_internal
);
10446 defsubr (&Skeyboard_coding_system
);
10447 defsubr (&Sfind_operation_coding_system
);
10448 defsubr (&Sset_coding_system_priority
);
10449 defsubr (&Sdefine_coding_system_internal
);
10450 defsubr (&Sdefine_coding_system_alias
);
10451 defsubr (&Scoding_system_put
);
10452 defsubr (&Scoding_system_base
);
10453 defsubr (&Scoding_system_plist
);
10454 defsubr (&Scoding_system_aliases
);
10455 defsubr (&Scoding_system_eol_type
);
10456 defsubr (&Scoding_system_priority_list
);
10458 DEFVAR_LISP ("coding-system-list", Vcoding_system_list
,
10459 doc
: /* List of coding systems.
10461 Do not alter the value of this variable manually. This variable should be
10462 updated by the functions `define-coding-system' and
10463 `define-coding-system-alias'. */);
10464 Vcoding_system_list
= Qnil
;
10466 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist
,
10467 doc
: /* Alist of coding system names.
10468 Each element is one element list of coding system name.
10469 This variable is given to `completing-read' as COLLECTION argument.
10471 Do not alter the value of this variable manually. This variable should be
10472 updated by the functions `make-coding-system' and
10473 `define-coding-system-alias'. */);
10474 Vcoding_system_alist
= Qnil
;
10476 DEFVAR_LISP ("coding-category-list", Vcoding_category_list
,
10477 doc
: /* List of coding-categories (symbols) ordered by priority.
10479 On detecting a coding system, Emacs tries code detection algorithms
10480 associated with each coding-category one by one in this order. When
10481 one algorithm agrees with a byte sequence of source text, the coding
10482 system bound to the corresponding coding-category is selected.
10484 Don't modify this variable directly, but use `set-coding-system-priority'. */);
10488 Vcoding_category_list
= Qnil
;
10489 for (i
= coding_category_max
- 1; i
>= 0; i
--)
10490 Vcoding_category_list
10491 = Fcons (AREF (Vcoding_category_table
, i
),
10492 Vcoding_category_list
);
10495 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read
,
10496 doc
: /* Specify the coding system for read operations.
10497 It is useful to bind this variable with `let', but do not set it globally.
10498 If the value is a coding system, it is used for decoding on read operation.
10499 If not, an appropriate element is used from one of the coding system alists.
10500 There are three such tables: `file-coding-system-alist',
10501 `process-coding-system-alist', and `network-coding-system-alist'. */);
10502 Vcoding_system_for_read
= Qnil
;
10504 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write
,
10505 doc
: /* Specify the coding system for write operations.
10506 Programs bind this variable with `let', but you should not set it globally.
10507 If the value is a coding system, it is used for encoding of output,
10508 when writing it to a file and when sending it to a file or subprocess.
10510 If this does not specify a coding system, an appropriate element
10511 is used from one of the coding system alists.
10512 There are three such tables: `file-coding-system-alist',
10513 `process-coding-system-alist', and `network-coding-system-alist'.
10514 For output to files, if the above procedure does not specify a coding system,
10515 the value of `buffer-file-coding-system' is used. */);
10516 Vcoding_system_for_write
= Qnil
;
10518 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used
,
10520 Coding system used in the latest file or process I/O. */);
10521 Vlast_coding_system_used
= Qnil
;
10523 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error
,
10525 Error status of the last code conversion.
10527 When an error was detected in the last code conversion, this variable
10528 is set to one of the following symbols.
10529 `insufficient-source'
10533 `insufficient-memory'
10534 When no error was detected, the value doesn't change. So, to check
10535 the error status of a code conversion by this variable, you must
10536 explicitly set this variable to nil before performing code
10538 Vlast_code_conversion_error
= Qnil
;
10540 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion
,
10542 *Non-nil means always inhibit code conversion of end-of-line format.
10543 See info node `Coding Systems' and info node `Text and Binary' concerning
10544 such conversion. */);
10545 inhibit_eol_conversion
= 0;
10547 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system
,
10549 Non-nil means process buffer inherits coding system of process output.
10550 Bind it to t if the process output is to be treated as if it were a file
10551 read from some filesystem. */);
10552 inherit_process_coding_system
= 0;
10554 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist
,
10556 Alist to decide a coding system to use for a file I/O operation.
10557 The format is ((PATTERN . VAL) ...),
10558 where PATTERN is a regular expression matching a file name,
10559 VAL is a coding system, a cons of coding systems, or a function symbol.
10560 If VAL is a coding system, it is used for both decoding and encoding
10562 If VAL is a cons of coding systems, the car part is used for decoding,
10563 and the cdr part is used for encoding.
10564 If VAL is a function symbol, the function must return a coding system
10565 or a cons of coding systems which are used as above. The function is
10566 called with an argument that is a list of the arguments with which
10567 `find-operation-coding-system' was called. If the function can't decide
10568 a coding system, it can return `undecided' so that the normal
10569 code-detection is performed.
10571 See also the function `find-operation-coding-system'
10572 and the variable `auto-coding-alist'. */);
10573 Vfile_coding_system_alist
= Qnil
;
10575 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist
,
10577 Alist to decide a coding system to use for a process I/O operation.
10578 The format is ((PATTERN . VAL) ...),
10579 where PATTERN is a regular expression matching a program name,
10580 VAL is a coding system, a cons of coding systems, or a function symbol.
10581 If VAL is a coding system, it is used for both decoding what received
10582 from the program and encoding what sent to the program.
10583 If VAL is a cons of coding systems, the car part is used for decoding,
10584 and the cdr part is used for encoding.
10585 If VAL is a function symbol, the function must return a coding system
10586 or a cons of coding systems which are used as above.
10588 See also the function `find-operation-coding-system'. */);
10589 Vprocess_coding_system_alist
= Qnil
;
10591 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist
,
10593 Alist to decide a coding system to use for a network I/O operation.
10594 The format is ((PATTERN . VAL) ...),
10595 where PATTERN is a regular expression matching a network service name
10596 or is a port number to connect to,
10597 VAL is a coding system, a cons of coding systems, or a function symbol.
10598 If VAL is a coding system, it is used for both decoding what received
10599 from the network stream and encoding what sent to the network stream.
10600 If VAL is a cons of coding systems, the car part is used for decoding,
10601 and the cdr part is used for encoding.
10602 If VAL is a function symbol, the function must return a coding system
10603 or a cons of coding systems which are used as above.
10605 See also the function `find-operation-coding-system'. */);
10606 Vnetwork_coding_system_alist
= Qnil
;
10608 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system
,
10609 doc
: /* Coding system to use with system messages.
10610 Also used for decoding keyboard input on X Window system. */);
10611 Vlocale_coding_system
= Qnil
;
10613 /* The eol mnemonics are reset in startup.el system-dependently. */
10614 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix
,
10616 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10617 eol_mnemonic_unix
= build_pure_c_string (":");
10619 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos
,
10621 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10622 eol_mnemonic_dos
= build_pure_c_string ("\\");
10624 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac
,
10626 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10627 eol_mnemonic_mac
= build_pure_c_string ("/");
10629 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided
,
10631 *String displayed in mode line when end-of-line format is not yet determined. */);
10632 eol_mnemonic_undecided
= build_pure_c_string (":");
10634 DEFVAR_LISP ("enable-character-translation", Venable_character_translation
,
10636 *Non-nil enables character translation while encoding and decoding. */);
10637 Venable_character_translation
= Qt
;
10639 DEFVAR_LISP ("standard-translation-table-for-decode",
10640 Vstandard_translation_table_for_decode
,
10641 doc
: /* Table for translating characters while decoding. */);
10642 Vstandard_translation_table_for_decode
= Qnil
;
10644 DEFVAR_LISP ("standard-translation-table-for-encode",
10645 Vstandard_translation_table_for_encode
,
10646 doc
: /* Table for translating characters while encoding. */);
10647 Vstandard_translation_table_for_encode
= Qnil
;
10649 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table
,
10650 doc
: /* Alist of charsets vs revision numbers.
10651 While encoding, if a charset (car part of an element) is found,
10652 designate it with the escape sequence identifying revision (cdr part
10653 of the element). */);
10654 Vcharset_revision_table
= Qnil
;
10656 DEFVAR_LISP ("default-process-coding-system",
10657 Vdefault_process_coding_system
,
10658 doc
: /* Cons of coding systems used for process I/O by default.
10659 The car part is used for decoding a process output,
10660 the cdr part is used for encoding a text to be sent to a process. */);
10661 Vdefault_process_coding_system
= Qnil
;
10663 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table
,
10665 Table of extra Latin codes in the range 128..159 (inclusive).
10666 This is a vector of length 256.
10667 If Nth element is non-nil, the existence of code N in a file
10668 \(or output of subprocess) doesn't prevent it to be detected as
10669 a coding system of ISO 2022 variant which has a flag
10670 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10671 or reading output of a subprocess.
10672 Only 128th through 159th elements have a meaning. */);
10673 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
10675 DEFVAR_LISP ("select-safe-coding-system-function",
10676 Vselect_safe_coding_system_function
,
10678 Function to call to select safe coding system for encoding a text.
10680 If set, this function is called to force a user to select a proper
10681 coding system which can encode the text in the case that a default
10682 coding system used in each operation can't encode the text. The
10683 function should take care that the buffer is not modified while
10684 the coding system is being selected.
10686 The default value is `select-safe-coding-system' (which see). */);
10687 Vselect_safe_coding_system_function
= Qnil
;
10689 DEFVAR_BOOL ("coding-system-require-warning",
10690 coding_system_require_warning
,
10691 doc
: /* Internal use only.
10692 If non-nil, on writing a file, `select-safe-coding-system-function' is
10693 called even if `coding-system-for-write' is non-nil. The command
10694 `universal-coding-system-argument' binds this variable to t temporarily. */);
10695 coding_system_require_warning
= 0;
10698 DEFVAR_BOOL ("inhibit-iso-escape-detection",
10699 inhibit_iso_escape_detection
,
10701 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10703 When Emacs reads text, it tries to detect how the text is encoded.
10704 This code detection is sensitive to escape sequences. If Emacs sees
10705 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10706 of the ISO2022 encodings, and decodes text by the corresponding coding
10707 system (e.g. `iso-2022-7bit').
10709 However, there may be a case that you want to read escape sequences in
10710 a file as is. In such a case, you can set this variable to non-nil.
10711 Then the code detection will ignore any escape sequences, and no text is
10712 detected as encoded in some ISO-2022 encoding. The result is that all
10713 escape sequences become visible in a buffer.
10715 The default value is nil, and it is strongly recommended not to change
10716 it. That is because many Emacs Lisp source files that contain
10717 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10718 in Emacs's distribution, and they won't be decoded correctly on
10719 reading if you suppress escape sequence detection.
10721 The other way to read escape sequences in a file without decoding is
10722 to explicitly specify some coding system that doesn't use ISO-2022
10723 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
10724 inhibit_iso_escape_detection
= 0;
10726 DEFVAR_BOOL ("inhibit-null-byte-detection",
10727 inhibit_null_byte_detection
,
10728 doc
: /* If non-nil, Emacs ignores null bytes on code detection.
10729 By default, Emacs treats it as binary data, and does not attempt to
10730 decode it. The effect is as if you specified `no-conversion' for
10733 Set this to non-nil when a regular text happens to include null bytes.
10734 Examples are Index nodes of Info files and null-byte delimited output
10735 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10736 decode text as usual. */);
10737 inhibit_null_byte_detection
= 0;
10739 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input
,
10740 doc
: /* Char table for translating self-inserting characters.
10741 This is applied to the result of input methods, not their input.
10742 See also `keyboard-translate-table'.
10744 Use of this variable for character code unification was rendered
10745 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10746 internal character representation. */);
10747 Vtranslation_table_for_input
= Qnil
;
10750 Lisp_Object args
[coding_arg_max
];
10751 Lisp_Object plist
[16];
10754 for (i
= 0; i
< coding_arg_max
; i
++)
10757 plist
[0] = intern_c_string (":name");
10758 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
10759 plist
[2] = intern_c_string (":mnemonic");
10760 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
10761 plist
[4] = intern_c_string (":coding-type");
10762 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
10763 plist
[6] = intern_c_string (":ascii-compatible-p");
10764 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
10765 plist
[8] = intern_c_string (":default-char");
10766 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
10767 plist
[10] = intern_c_string (":for-unibyte");
10768 plist
[11] = args
[coding_arg_for_unibyte
] = Qt
;
10769 plist
[12] = intern_c_string (":docstring");
10770 plist
[13] = build_pure_c_string ("Do no conversion.\n\
10772 When you visit a file with this coding, the file is read into a\n\
10773 unibyte buffer as is, thus each byte of a file is treated as a\n\
10775 plist
[14] = intern_c_string (":eol-type");
10776 plist
[15] = args
[coding_arg_eol_type
] = Qunix
;
10777 args
[coding_arg_plist
] = Flist (16, plist
);
10778 Fdefine_coding_system_internal (coding_arg_max
, args
);
10780 plist
[1] = args
[coding_arg_name
] = Qundecided
;
10781 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('-');
10782 plist
[5] = args
[coding_arg_coding_type
] = Qundecided
;
10783 /* This is already set.
10784 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10785 plist
[8] = intern_c_string (":charset-list");
10786 plist
[9] = args
[coding_arg_charset_list
] = Fcons (Qascii
, Qnil
);
10787 plist
[11] = args
[coding_arg_for_unibyte
] = Qnil
;
10788 plist
[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10789 plist
[15] = args
[coding_arg_eol_type
] = Qnil
;
10790 args
[coding_arg_plist
] = Flist (16, plist
);
10791 Fdefine_coding_system_internal (coding_arg_max
, args
);
10794 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
10799 for (i
= 0; i
< coding_category_max
; i
++)
10800 Fset (AREF (Vcoding_category_table
, i
), Qno_conversion
);
10802 #if defined (DOS_NT)
10803 system_eol_type
= Qdos
;
10805 system_eol_type
= Qunix
;
10807 staticpro (&system_eol_type
);
10811 emacs_strerror (int error_number
)
10815 synchronize_system_messages_locale ();
10816 str
= strerror (error_number
);
10818 if (! NILP (Vlocale_coding_system
))
10820 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
10821 Vlocale_coding_system
,
10823 str
= SSDATA (dec
);