1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2015 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcoding_system_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
150 Return true if the byte sequence conforms to XXX.
152 Below is the template of these functions. */
156 detect_coding_XXX (struct coding_system
*coding
,
157 struct coding_detection_info
*detect_info
)
159 const unsigned char *src
= coding
->source
;
160 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
161 bool multibytep
= coding
->src_multibyte
;
162 ptrdiff_t consumed_chars
= 0;
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
172 if (! __C_conforms_to_XXX___ (c
))
174 if (! __C_strongly_suggests_XXX__ (c
))
175 found
= CATEGORY_MASK_XXX
;
177 /* The byte sequence is invalid for XXX. */
178 detect_info
->rejected
|= CATEGORY_MASK_XXX
;
182 /* The source exhausted successfully. */
183 detect_info
->found
|= found
;
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size.
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
200 Below is the template of these functions. */
204 decode_coding_XXXX (struct coding_system
*coding
)
206 const unsigned char *src
= coding
->source
+ coding
->consumed
;
207 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base
;
212 /* A buffer to produce decoded characters. */
213 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
214 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
;
215 bool multibytep
= coding
->src_multibyte
;
220 if (charbuf
>= charbuf_end
)
221 /* No more room to produce a decoded character. */
228 if (src_base
< src_end
229 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base
< src_end
&& charbuf
< charbuf_end
)
233 *charbuf
++ = *src_base
++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
237 /* Remember how many characters we produced. */
238 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce an encoded text until it
256 reaches the head of not-yet-encoded source text.
258 Below is a template of these functions. */
261 encode_coding_XXX (struct coding_system
*coding
)
263 bool multibytep
= coding
->dst_multibyte
;
264 int *charbuf
= coding
->charbuf
;
265 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
266 unsigned char *dst
= coding
->destination
+ coding
->produced
;
267 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
268 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
269 ptrdiff_t produced_chars
= 0;
271 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
274 /* Encode C into DST, and increment DST. */
276 label_no_more_destination
:
277 /* How many chars and bytes we produced. */
278 coding
->produced_char
+= produced_chars
;
279 coding
->produced
= dst
- coding
->destination
;
284 /*** 1. Preamble ***/
291 #endif /* HAVE_WCHAR_H */
294 #include "character.h"
298 #include "composite.h"
302 #include "termhooks.h"
304 Lisp_Object Vcoding_system_hash_table
;
306 static Lisp_Object Qcoding_system
, Qeol_type
;
307 static Lisp_Object Qcoding_aliases
;
308 Lisp_Object Qunix
, Qdos
;
309 static Lisp_Object Qmac
;
310 Lisp_Object Qbuffer_file_coding_system
;
311 static Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
312 static Lisp_Object Qdefault_char
;
313 Lisp_Object Qno_conversion
, Qundecided
;
314 Lisp_Object Qcharset
, Qutf_8
;
315 static Lisp_Object Qiso_2022
;
316 static Lisp_Object Qutf_16
, Qshift_jis
, Qbig5
;
317 static Lisp_Object Qbig
, Qlittle
;
318 static Lisp_Object Qcoding_system_history
;
319 static Lisp_Object Qvalid_codes
;
320 static Lisp_Object QCcategory
, QCmnemonic
, QCdefault_char
;
321 static Lisp_Object QCdecode_translation_table
, QCencode_translation_table
;
322 static Lisp_Object QCpost_read_conversion
, QCpre_write_conversion
;
323 static Lisp_Object QCascii_compatible_p
;
325 Lisp_Object Qcall_process
, Qcall_process_region
;
326 Lisp_Object Qstart_process
, Qopen_network_stream
;
327 static Lisp_Object Qtarget_idx
;
329 static Lisp_Object Qinsufficient_source
, Qinvalid_source
, Qinterrupted
;
331 /* If a symbol has this property, evaluate the value to define the
332 symbol as a coding system. */
333 static Lisp_Object Qcoding_system_define_form
;
335 /* Format of end-of-line decided by system. This is Qunix on
336 Unix and Mac, Qdos on DOS/Windows.
337 This has an effect only for external encoding (i.e. for output to
338 file and process), not for in-buffer or Lisp string encoding. */
339 static Lisp_Object system_eol_type
;
343 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
345 /* Coding system emacs-mule and raw-text are for converting only
346 end-of-line format. */
347 Lisp_Object Qemacs_mule
, Qraw_text
;
348 Lisp_Object Qutf_8_emacs
;
350 #if defined (WINDOWSNT) || defined (CYGWIN)
351 static Lisp_Object Qutf_16le
;
354 /* Coding-systems are handed between Emacs Lisp programs and C internal
355 routines by the following three variables. */
356 /* Coding system to be used to encode text for terminal display when
357 terminal coding system is nil. */
358 struct coding_system safe_terminal_coding
;
362 Lisp_Object Qtranslation_table
;
363 Lisp_Object Qtranslation_table_id
;
364 static Lisp_Object Qtranslation_table_for_decode
;
365 static Lisp_Object Qtranslation_table_for_encode
;
367 /* Two special coding systems. */
368 static Lisp_Object Vsjis_coding_system
;
369 static Lisp_Object Vbig5_coding_system
;
371 /* ISO2022 section */
373 #define CODING_ISO_INITIAL(coding, reg) \
374 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
375 coding_attr_iso_initial), \
379 #define CODING_ISO_REQUEST(coding, charset_id) \
380 (((charset_id) <= (coding)->max_charset_id \
381 ? ((coding)->safe_charsets[charset_id] != 255 \
382 ? (coding)->safe_charsets[charset_id] \
387 #define CODING_ISO_FLAGS(coding) \
388 ((coding)->spec.iso_2022.flags)
389 #define CODING_ISO_DESIGNATION(coding, reg) \
390 ((coding)->spec.iso_2022.current_designation[reg])
391 #define CODING_ISO_INVOCATION(coding, plane) \
392 ((coding)->spec.iso_2022.current_invocation[plane])
393 #define CODING_ISO_SINGLE_SHIFTING(coding) \
394 ((coding)->spec.iso_2022.single_shifting)
395 #define CODING_ISO_BOL(coding) \
396 ((coding)->spec.iso_2022.bol)
397 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
398 (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1 \
399 : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
400 #define CODING_ISO_CMP_STATUS(coding) \
401 (&(coding)->spec.iso_2022.cmp_status)
402 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
403 ((coding)->spec.iso_2022.ctext_extended_segment_len)
404 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
405 ((coding)->spec.iso_2022.embedded_utf_8)
407 /* Control characters of ISO2022. */
408 /* code */ /* function */
409 #define ISO_CODE_SO 0x0E /* shift-out */
410 #define ISO_CODE_SI 0x0F /* shift-in */
411 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
412 #define ISO_CODE_ESC 0x1B /* escape */
413 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
414 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
415 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
417 /* All code (1-byte) of ISO2022 is classified into one of the
419 enum iso_code_class_type
421 ISO_control_0
, /* Control codes in the range
422 0x00..0x1F and 0x7F, except for the
423 following 5 codes. */
424 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
425 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
426 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
427 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
428 ISO_control_1
, /* Control codes in the range
429 0x80..0x9F, except for the
430 following 3 codes. */
431 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
432 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
433 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
434 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
435 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
436 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
437 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
440 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
441 `iso-flags' attribute of an iso2022 coding system. */
443 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
444 instead of the correct short-form sequence (e.g. ESC $ A). */
445 #define CODING_ISO_FLAG_LONG_FORM 0x0001
447 /* If set, reset graphic planes and registers at end-of-line to the
449 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
451 /* If set, reset graphic planes and registers before any control
452 characters to the initial state. */
453 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
455 /* If set, encode by 7-bit environment. */
456 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
458 /* If set, use locking-shift function. */
459 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
461 /* If set, use single-shift function. Overwrite
462 CODING_ISO_FLAG_LOCKING_SHIFT. */
463 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
465 /* If set, use designation escape sequence. */
466 #define CODING_ISO_FLAG_DESIGNATION 0x0040
468 /* If set, produce revision number sequence. */
469 #define CODING_ISO_FLAG_REVISION 0x0080
471 /* If set, produce ISO6429's direction specifying sequence. */
472 #define CODING_ISO_FLAG_DIRECTION 0x0100
474 /* If set, assume designation states are reset at beginning of line on
476 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
478 /* If set, designation sequence should be placed at beginning of line
480 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
482 /* If set, do not encode unsafe characters on output. */
483 #define CODING_ISO_FLAG_SAFE 0x0800
485 /* If set, extra latin codes (128..159) are accepted as a valid code
487 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
489 #define CODING_ISO_FLAG_COMPOSITION 0x2000
491 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
493 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
495 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
497 #define CODING_ISO_FLAG_LEVEL_4 0x20000
499 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
501 /* A character to be produced on output if encoding of the original
502 character is prohibited by CODING_ISO_FLAG_SAFE. */
503 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
506 #define CODING_UTF_8_BOM(coding) \
507 ((coding)->spec.utf_8_bom)
510 #define CODING_UTF_16_BOM(coding) \
511 ((coding)->spec.utf_16.bom)
513 #define CODING_UTF_16_ENDIAN(coding) \
514 ((coding)->spec.utf_16.endian)
516 #define CODING_UTF_16_SURROGATE(coding) \
517 ((coding)->spec.utf_16.surrogate)
521 #define CODING_CCL_DECODER(coding) \
522 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
523 #define CODING_CCL_ENCODER(coding) \
524 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
525 #define CODING_CCL_VALIDS(coding) \
526 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
528 /* Index for each coding category in `coding_categories' */
532 coding_category_iso_7
,
533 coding_category_iso_7_tight
,
534 coding_category_iso_8_1
,
535 coding_category_iso_8_2
,
536 coding_category_iso_7_else
,
537 coding_category_iso_8_else
,
538 coding_category_utf_8_auto
,
539 coding_category_utf_8_nosig
,
540 coding_category_utf_8_sig
,
541 coding_category_utf_16_auto
,
542 coding_category_utf_16_be
,
543 coding_category_utf_16_le
,
544 coding_category_utf_16_be_nosig
,
545 coding_category_utf_16_le_nosig
,
546 coding_category_charset
,
547 coding_category_sjis
,
548 coding_category_big5
,
550 coding_category_emacs_mule
,
551 /* All above are targets of code detection. */
552 coding_category_raw_text
,
553 coding_category_undecided
,
557 /* Definitions of flag bits used in detect_coding_XXXX. */
558 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
559 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
560 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
561 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
562 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
563 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
564 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
565 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
566 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
567 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
568 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
569 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
570 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
571 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
572 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
573 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
574 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
575 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
576 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
577 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
579 /* This value is returned if detect_coding_mask () finds nothing other
580 than ASCII characters. */
581 #define CATEGORY_MASK_ANY \
582 (CATEGORY_MASK_ISO_7 \
583 | CATEGORY_MASK_ISO_7_TIGHT \
584 | CATEGORY_MASK_ISO_8_1 \
585 | CATEGORY_MASK_ISO_8_2 \
586 | CATEGORY_MASK_ISO_7_ELSE \
587 | CATEGORY_MASK_ISO_8_ELSE \
588 | CATEGORY_MASK_UTF_8_AUTO \
589 | CATEGORY_MASK_UTF_8_NOSIG \
590 | CATEGORY_MASK_UTF_8_SIG \
591 | CATEGORY_MASK_UTF_16_AUTO \
592 | CATEGORY_MASK_UTF_16_BE \
593 | CATEGORY_MASK_UTF_16_LE \
594 | CATEGORY_MASK_UTF_16_BE_NOSIG \
595 | CATEGORY_MASK_UTF_16_LE_NOSIG \
596 | CATEGORY_MASK_CHARSET \
597 | CATEGORY_MASK_SJIS \
598 | CATEGORY_MASK_BIG5 \
599 | CATEGORY_MASK_CCL \
600 | CATEGORY_MASK_EMACS_MULE)
603 #define CATEGORY_MASK_ISO_7BIT \
604 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
606 #define CATEGORY_MASK_ISO_8BIT \
607 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
609 #define CATEGORY_MASK_ISO_ELSE \
610 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
612 #define CATEGORY_MASK_ISO_ESCAPE \
613 (CATEGORY_MASK_ISO_7 \
614 | CATEGORY_MASK_ISO_7_TIGHT \
615 | CATEGORY_MASK_ISO_7_ELSE \
616 | CATEGORY_MASK_ISO_8_ELSE)
618 #define CATEGORY_MASK_ISO \
619 ( CATEGORY_MASK_ISO_7BIT \
620 | CATEGORY_MASK_ISO_8BIT \
621 | CATEGORY_MASK_ISO_ELSE)
623 #define CATEGORY_MASK_UTF_16 \
624 (CATEGORY_MASK_UTF_16_AUTO \
625 | CATEGORY_MASK_UTF_16_BE \
626 | CATEGORY_MASK_UTF_16_LE \
627 | CATEGORY_MASK_UTF_16_BE_NOSIG \
628 | CATEGORY_MASK_UTF_16_LE_NOSIG)
630 #define CATEGORY_MASK_UTF_8 \
631 (CATEGORY_MASK_UTF_8_AUTO \
632 | CATEGORY_MASK_UTF_8_NOSIG \
633 | CATEGORY_MASK_UTF_8_SIG)
635 /* Table of coding categories (Lisp symbols). This variable is for
636 internal use only. */
637 static Lisp_Object Vcoding_category_table
;
639 /* Table of coding-categories ordered by priority. */
640 static enum coding_category coding_priorities
[coding_category_max
];
642 /* Nth element is a coding context for the coding system bound to the
643 Nth coding category. */
644 static struct coding_system coding_categories
[coding_category_max
];
646 /*** Commonly used macros and functions ***/
/* Yield the smaller of A and B.  The selected argument is expanded
   twice, so do not pass expressions with side effects.  */
649 #define min(a, b) ((a) < (b) ? (a) : (b))
/* Yield the larger of A and B.  The selected argument is expanded
   twice, so do not pass expressions with side effects.  */
652 #define max(a, b) ((a) > (b) ? (a) : (b))
655 /* Encode a flag that can be nil, something else, or t as -1, 0, 1. */
658 encode_inhibit_flag (Lisp_Object flag
)
660 return NILP (flag
) ? -1 : EQ (flag
, Qt
);
663 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
664 1 means yes, -1 means no, 0 means ask the user variable VAR. */
667 inhibit_flag (int encoded_flag
, bool var
)
669 return 0 < encoded_flag
+ var
;
672 #define CODING_GET_INFO(coding, attrs, charset_list) \
674 (attrs) = CODING_ID_ATTRS ((coding)->id); \
675 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
679 CHECK_NATNUM_CAR (Lisp_Object x
)
681 Lisp_Object tmp
= XCAR (x
);
687 CHECK_NATNUM_CDR (Lisp_Object x
)
689 Lisp_Object tmp
= XCDR (x
);
695 /* Safely get one byte from the source text pointed by SRC which ends
696 at SRC_END, and set C to that byte. If there are not enough bytes
697 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
698 and a multibyte character is found at SRC, set C to the
699 negative value of the character code. The caller should declare
700 and set these variables appropriately in advance:
701 src, src_end, multibytep */
703 #define ONE_MORE_BYTE(c) \
705 if (src == src_end) \
707 if (src_base < src) \
708 record_conversion_result \
709 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
710 goto no_more_source; \
713 if (multibytep && (c & 0x80)) \
715 if ((c & 0xFE) == 0xC0) \
716 c = ((c & 1) << 6) | *src++; \
720 c = - string_char (src, &src, NULL); \
721 record_conversion_result \
722 (coding, CODING_RESULT_INVALID_SRC); \
728 /* Safely get two bytes from the source text pointed by SRC which ends
729 at SRC_END, and set C1 and C2 to those bytes while skipping the
730 heading multibyte characters. If there are not enough bytes in the
731 source, it jumps to 'no_more_source'. If MULTIBYTEP and
732 a multibyte character is found for C2, set C2 to the negative value
733 of the character code. The caller should declare and set these
734 variables appropriately in advance:
735 src, src_end, multibytep
736 It is intended that this macro is used in detect_coding_utf_16. */
738 #define TWO_MORE_BYTES(c1, c2) \
741 if (src == src_end) \
742 goto no_more_source; \
744 if (multibytep && (c1 & 0x80)) \
746 if ((c1 & 0xFE) == 0xC0) \
747 c1 = ((c1 & 1) << 6) | *src++; \
750 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
755 if (src == src_end) \
756 goto no_more_source; \
758 if (multibytep && (c2 & 0x80)) \
760 if ((c2 & 0xFE) == 0xC0) \
761 c2 = ((c2 & 1) << 6) | *src++; \
768 /* Store a byte C in the place pointed by DST and increment DST to the
769 next free point, and increment PRODUCED_CHARS. The caller should
770 assure that C is 0..127, and declare and set the variable `dst'
771 appropriately in advance.
775 #define EMIT_ONE_ASCII_BYTE(c) \
782 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
784 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
786 produced_chars += 2; \
787 *dst++ = (c1), *dst++ = (c2); \
791 /* Store a byte C in the place pointed by DST and increment DST to the
792 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
793 store in an appropriate multibyte form. The caller should
794 declare and set the variables `dst' and `multibytep' appropriately
797 #define EMIT_ONE_BYTE(c) \
804 ch = BYTE8_TO_CHAR (ch); \
805 CHAR_STRING_ADVANCE (ch, dst); \
812 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
814 #define EMIT_TWO_BYTES(c1, c2) \
816 produced_chars += 2; \
823 ch = BYTE8_TO_CHAR (ch); \
824 CHAR_STRING_ADVANCE (ch, dst); \
827 ch = BYTE8_TO_CHAR (ch); \
828 CHAR_STRING_ADVANCE (ch, dst); \
838 #define EMIT_THREE_BYTES(c1, c2, c3) \
840 EMIT_ONE_BYTE (c1); \
841 EMIT_TWO_BYTES (c2, c3); \
845 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
847 EMIT_TWO_BYTES (c1, c2); \
848 EMIT_TWO_BYTES (c3, c4); \
853 record_conversion_result (struct coding_system
*coding
,
854 enum coding_result_code result
)
856 coding
->result
= result
;
859 case CODING_RESULT_INSUFFICIENT_SRC
:
860 Vlast_code_conversion_error
= Qinsufficient_source
;
862 case CODING_RESULT_INVALID_SRC
:
863 Vlast_code_conversion_error
= Qinvalid_source
;
865 case CODING_RESULT_INTERRUPT
:
866 Vlast_code_conversion_error
= Qinterrupted
;
868 case CODING_RESULT_INSUFFICIENT_DST
:
869 /* Don't record this error in Vlast_code_conversion_error
870 because it happens just temporarily and is resolved when the
871 whole conversion is finished. */
873 case CODING_RESULT_SUCCESS
:
876 Vlast_code_conversion_error
= intern ("Unknown error");
880 /* These wrapper macros are used to preserve validity of pointers into
881 buffer text across calls to decode_char, encode_char, etc, which
882 could cause relocation of buffers if it loads a charset map,
883 because loading a charset map allocates large structures. */
885 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
889 charset_map_loaded = 0; \
890 c = DECODE_CHAR (charset, code); \
891 if (charset_map_loaded \
892 && (offset = coding_change_source (coding))) \
895 src_base += offset; \
900 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
904 charset_map_loaded = 0; \
905 code = ENCODE_CHAR (charset, c); \
906 if (charset_map_loaded \
907 && (offset = coding_change_destination (coding))) \
914 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
918 charset_map_loaded = 0; \
919 charset = char_charset (c, charset_list, code_return); \
920 if (charset_map_loaded \
921 && (offset = coding_change_destination (coding))) \
928 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
932 charset_map_loaded = 0; \
933 result = CHAR_CHARSET_P (c, charset); \
934 if (charset_map_loaded \
935 && (offset = coding_change_destination (coding))) \
943 /* If no more than BYTES bytes of room remain at dst, allocate memory
944 for coding->destination and update dst and dst_end. We don't have
945 to take care of coding->source which will be relocated. It is
946 handled by calling coding_set_source in encode_coding. */
948 #define ASSURE_DESTINATION(bytes) \
950 if (dst + (bytes) >= dst_end) \
952 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
954 dst = alloc_destination (coding, more_bytes, dst); \
955 dst_end = coding->destination + coding->dst_bytes; \
960 /* Store multibyte form of the character C in P, and advance P to the
961 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
962 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
963 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
965 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
967 /* Return the character code of character whose multibyte form is at
968 P, and advance P to the end of the multibyte form. This used to be
969 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
970 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
972 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
974 /* Set coding->source from coding->src_object. */
977 coding_set_source (struct coding_system
*coding
)
979 if (BUFFERP (coding
->src_object
))
981 struct buffer
*buf
= XBUFFER (coding
->src_object
);
983 if (coding
->src_pos
< 0)
984 coding
->source
= BUF_GAP_END_ADDR (buf
) + coding
->src_pos_byte
;
986 coding
->source
= BUF_BYTE_ADDRESS (buf
, coding
->src_pos_byte
);
988 else if (STRINGP (coding
->src_object
))
990 coding
->source
= SDATA (coding
->src_object
) + coding
->src_pos_byte
;
994 /* Otherwise, the source is C string and is never relocated
995 automatically. Thus we don't have to update anything. */
1000 /* Set coding->source from coding->src_object, and return how many
1001 bytes coding->source was changed. */
1004 coding_change_source (struct coding_system
*coding
)
1006 const unsigned char *orig
= coding
->source
;
1007 coding_set_source (coding
);
1008 return coding
->source
- orig
;
1012 /* Set coding->destination from coding->dst_object. */
1015 coding_set_destination (struct coding_system
*coding
)
1017 if (BUFFERP (coding
->dst_object
))
1019 if (BUFFERP (coding
->src_object
) && coding
->src_pos
< 0)
1021 coding
->destination
= BEG_ADDR
+ coding
->dst_pos_byte
- BEG_BYTE
;
1022 coding
->dst_bytes
= (GAP_END_ADDR
1023 - (coding
->src_bytes
- coding
->consumed
)
1024 - coding
->destination
);
1028 /* We are sure that coding->dst_pos_byte is before the gap
1030 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
1031 + coding
->dst_pos_byte
- BEG_BYTE
);
1032 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
1033 - coding
->destination
);
1038 /* Otherwise, the destination is C string and is never relocated
1039 automatically. Thus we don't have to update anything. */
1044 /* Set coding->destination from coding->dst_object, and return how
1045 many bytes coding->destination was changed. */
1048 coding_change_destination (struct coding_system
*coding
)
1050 const unsigned char *orig
= coding
->destination
;
1051 coding_set_destination (coding
);
1052 return coding
->destination
- orig
;
1057 coding_alloc_by_realloc (struct coding_system
*coding
, ptrdiff_t bytes
)
1059 if (STRING_BYTES_BOUND
- coding
->dst_bytes
< bytes
)
1061 coding
->destination
= xrealloc (coding
->destination
,
1062 coding
->dst_bytes
+ bytes
);
1063 coding
->dst_bytes
+= bytes
;
1067 coding_alloc_by_making_gap (struct coding_system
*coding
,
1068 ptrdiff_t gap_head_used
, ptrdiff_t bytes
)
1070 if (EQ (coding
->src_object
, coding
->dst_object
))
1072 /* The gap may contain the produced data at the head and not-yet
1073 consumed data at the tail. To preserve those data, we at
1074 first make the gap size to zero, then increase the gap
1076 ptrdiff_t add
= GAP_SIZE
;
1078 GPT
+= gap_head_used
, GPT_BYTE
+= gap_head_used
;
1079 GAP_SIZE
= 0; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
1081 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
1082 GPT
-= gap_head_used
, GPT_BYTE
-= gap_head_used
;
1085 make_gap_1 (XBUFFER (coding
->dst_object
), bytes
);
1089 static unsigned char *
1090 alloc_destination (struct coding_system
*coding
, ptrdiff_t nbytes
,
1093 ptrdiff_t offset
= dst
- coding
->destination
;
1095 if (BUFFERP (coding
->dst_object
))
1097 struct buffer
*buf
= XBUFFER (coding
->dst_object
);
1099 coding_alloc_by_making_gap (coding
, dst
- BUF_GPT_ADDR (buf
), nbytes
);
1102 coding_alloc_by_realloc (coding
, nbytes
);
1103 coding_set_destination (coding
);
1104 dst
= coding
->destination
+ offset
;
1108 /** Macros for annotations. */
1110 /* Annotation data is stored in the array coding->charbuf in this
1112 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1113 LENGTH is the number of elements in the annotation.
1114 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1115 NCHARS is the number of characters in the text annotated.
1117 The format of the following elements depends on ANNOTATION_MASK.
1119 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1121 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1123 NBYTES is the number of bytes specified in the header part of
1124 old-style emacs-mule encoding, or 0 for the other kind of
1127 METHOD is one of enum composition_method.
1129 Optional COMPOSITION-COMPONENTS are characters and composition
1132 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1135 If ANNOTATION_MASK is 0, this annotation is just a placeholder to
1136 recover from an invalid annotation, and should be skipped by
1137 produce_annotation. */
1139 /* Maximum length of the header of annotation data. */
1140 #define MAX_ANNOTATION_LENGTH 5
1142 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1144 *(buf)++ = -(len); \
1145 *(buf)++ = (mask); \
1146 *(buf)++ = (nchars); \
1147 coding->annotated = 1; \
1150 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1152 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1158 #define ADD_CHARSET_DATA(buf, nchars, id) \
1160 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1165 /* Bitmasks for coding->eol_seen. */
/* Each non-zero value occupies its own bit (1, 2, 4), so several kinds
   of line ending can be recorded at once; detect_coding_utf_8 ORs
   them into its local copy of coding->eol_seen.  */
/* No bit set.  */
1167 #define EOL_SEEN_NONE 0
/* Bit OR-ed in by `eol_seen |= EOL_SEEN_LF'.  */
1168 #define EOL_SEEN_LF 1
/* Bit OR-ed in by `eol_seen |= EOL_SEEN_CR'.  */
1169 #define EOL_SEEN_CR 2
/* Bit OR-ed in by detect_coding_utf_8 when the next source byte is
   '\n'.  */
1170 #define EOL_SEEN_CRLF 4
1173 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1180 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1181 Return true if a text is encoded in UTF-8. */
/* Classify a single octet C of a UTF-8 byte sequence by its high bits.
   Each macro evaluates C exactly once.  */

/* True if C is below 0x80, i.e. a one-octet sequence.  Note that a
   negative C also satisfies this comparison.  */
1183 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
/* True if the top two bits of C are 10 (a trailing octet).  */
1184 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
/* True if the top three bits of C are 110 (leads a 2-octet sequence).  */
1185 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* True if the top four bits of C are 1110 (leads a 3-octet sequence).  */
1186 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* True if the top five bits of C are 11110 (leads a 4-octet sequence).  */
1187 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* True if the top six bits of C are 111110 (leads a 5-octet sequence).
   decode_coding_utf_8 range-checks the value decoded from such a
   sequence against 0x200000 and MAX_CHAR.  */
1188 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
/* The three octets of the UTF-8 byte order mark, in order.
   detect_coding_utf_8 compares them with src[0], src[1] and src[2];
   encode_coding_utf_8 emits them with EMIT_THREE_BYTES.  */
1190 #define UTF_8_BOM_1 0xEF
1191 #define UTF_8_BOM_2 0xBB
1192 #define UTF_8_BOM_3 0xBF
1194 /* Unlike the other detect_coding_XXX, this function counts the number
1195 of characters and checks the EOL format. */
1198 detect_coding_utf_8 (struct coding_system
*coding
,
1199 struct coding_detection_info
*detect_info
)
1201 const unsigned char *src
= coding
->source
, *src_base
;
1202 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1203 bool multibytep
= coding
->src_multibyte
;
1204 ptrdiff_t consumed_chars
= 0;
1206 ptrdiff_t nchars
= coding
->head_ascii
;
1207 int eol_seen
= coding
->eol_seen
;
1209 detect_info
->checked
|= CATEGORY_MASK_UTF_8
;
1210 /* A coding system of this category is always ASCII compatible. */
1213 if (src
== coding
->source
/* BOM should be at the head. */
1214 && src
+ 3 < src_end
/* BOM is 3 bytes long. */
1215 && src
[0] == UTF_8_BOM_1
1216 && src
[1] == UTF_8_BOM_2
1217 && src
[2] == UTF_8_BOM_3
)
1226 int c
, c1
, c2
, c3
, c4
;
1230 if (c
< 0 || UTF_8_1_OCTET_P (c
))
1235 if (src
< src_end
&& *src
== '\n')
1237 eol_seen
|= EOL_SEEN_CRLF
;
1242 eol_seen
|= EOL_SEEN_CR
;
1245 eol_seen
|= EOL_SEEN_LF
;
1249 if (c1
< 0 || ! UTF_8_EXTRA_OCTET_P (c1
))
1251 if (UTF_8_2_OCTET_LEADING_P (c
))
1257 if (c2
< 0 || ! UTF_8_EXTRA_OCTET_P (c2
))
1259 if (UTF_8_3_OCTET_LEADING_P (c
))
1265 if (c3
< 0 || ! UTF_8_EXTRA_OCTET_P (c3
))
1267 if (UTF_8_4_OCTET_LEADING_P (c
))
1273 if (c4
< 0 || ! UTF_8_EXTRA_OCTET_P (c4
))
1275 if (UTF_8_5_OCTET_LEADING_P (c
))
1282 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1286 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1288 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1293 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1294 detect_info
->found
|= CATEGORY_MASK_UTF_8_AUTO
| CATEGORY_MASK_UTF_8_SIG
| CATEGORY_MASK_UTF_8_NOSIG
;
1298 detect_info
->rejected
|= CATEGORY_MASK_UTF_8_SIG
;
1299 if (nchars
< src_end
- coding
->source
)
1300 /* The number of characters found is less than the number of source
1301 bytes, which means that we found valid non-ASCII characters. */
1302 detect_info
->found
|= CATEGORY_MASK_UTF_8_AUTO
| CATEGORY_MASK_UTF_8_NOSIG
;
1304 coding
->detected_utf8_bytes
= src_base
- coding
->source
;
1305 coding
->detected_utf8_chars
= nchars
;
1311 decode_coding_utf_8 (struct coding_system
*coding
)
1313 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1314 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1315 const unsigned char *src_base
;
1316 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
1317 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
;
1318 ptrdiff_t consumed_chars
= 0, consumed_chars_base
= 0;
1319 bool multibytep
= coding
->src_multibyte
;
1320 enum utf_bom_type bom
= CODING_UTF_8_BOM (coding
);
1322 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
1323 int byte_after_cr
= -1;
1325 if (bom
!= utf_without_bom
)
1331 if (! UTF_8_3_OCTET_LEADING_P (c1
))
1336 if (! UTF_8_EXTRA_OCTET_P (c2
))
1341 if (! UTF_8_EXTRA_OCTET_P (c3
))
1345 if ((c1
!= UTF_8_BOM_1
)
1346 || (c2
!= UTF_8_BOM_2
) || (c3
!= UTF_8_BOM_3
))
1349 CODING_UTF_8_BOM (coding
) = utf_without_bom
;
1354 CODING_UTF_8_BOM (coding
) = utf_without_bom
;
1358 int c
, c1
, c2
, c3
, c4
, c5
;
1361 consumed_chars_base
= consumed_chars
;
1363 if (charbuf
>= charbuf_end
)
1365 if (byte_after_cr
>= 0)
1370 /* In the simple case, rapidly handle ordinary characters */
1371 if (multibytep
&& ! eol_dos
1372 && charbuf
< charbuf_end
- 6 && src
< src_end
- 6)
1374 while (charbuf
< charbuf_end
- 6 && src
< src_end
- 6)
1404 /* If we handled at least one character, restart the main loop. */
1405 if (src
!= src_base
)
1409 if (byte_after_cr
>= 0)
1410 c1
= byte_after_cr
, byte_after_cr
= -1;
1417 else if (UTF_8_1_OCTET_P (c1
))
1419 if (eol_dos
&& c1
== '\r')
1420 ONE_MORE_BYTE (byte_after_cr
);
1426 if (c2
< 0 || ! UTF_8_EXTRA_OCTET_P (c2
))
1428 if (UTF_8_2_OCTET_LEADING_P (c1
))
1430 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1431 /* Reject overlong sequences here and below. Encoders
1432 producing them are incorrect, they can be misleading,
1433 and they mess up read/write invariance. */
1440 if (c3
< 0 || ! UTF_8_EXTRA_OCTET_P (c3
))
1442 if (UTF_8_3_OCTET_LEADING_P (c1
))
1444 c
= (((c1
& 0xF) << 12)
1445 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1447 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
1453 if (c4
< 0 || ! UTF_8_EXTRA_OCTET_P (c4
))
1455 if (UTF_8_4_OCTET_LEADING_P (c1
))
1457 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1458 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1465 if (c5
< 0 || ! UTF_8_EXTRA_OCTET_P (c5
))
1467 if (UTF_8_5_OCTET_LEADING_P (c1
))
1469 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1470 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1472 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1487 consumed_chars
= consumed_chars_base
;
1489 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1494 coding
->consumed_char
+= consumed_chars_base
;
1495 coding
->consumed
= src_base
- coding
->source
;
1496 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1501 encode_coding_utf_8 (struct coding_system
*coding
)
1503 bool multibytep
= coding
->dst_multibyte
;
1504 int *charbuf
= coding
->charbuf
;
1505 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1506 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1507 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1508 ptrdiff_t produced_chars
= 0;
1511 if (CODING_UTF_8_BOM (coding
) == utf_with_bom
)
1513 ASSURE_DESTINATION (3);
1514 EMIT_THREE_BYTES (UTF_8_BOM_1
, UTF_8_BOM_2
, UTF_8_BOM_3
);
1515 CODING_UTF_8_BOM (coding
) = utf_without_bom
;
1520 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1522 while (charbuf
< charbuf_end
)
1524 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1526 ASSURE_DESTINATION (safe_room
);
1528 if (CHAR_BYTE8_P (c
))
1530 c
= CHAR_TO_BYTE8 (c
);
1535 CHAR_STRING_ADVANCE_NO_UNIFY (c
, pend
);
1536 for (p
= str
; p
< pend
; p
++)
1543 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1545 while (charbuf
< charbuf_end
)
1547 ASSURE_DESTINATION (safe_room
);
1549 if (CHAR_BYTE8_P (c
))
1550 *dst
++ = CHAR_TO_BYTE8 (c
);
1552 CHAR_STRING_ADVANCE_NO_UNIFY (c
, dst
);
1554 produced_chars
= dst
- (coding
->destination
+ coding
->produced
);
1556 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1557 coding
->produced_char
+= produced_chars
;
1558 coding
->produced
= dst
- coding
->destination
;
1563 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1564 Return true if a text is encoded in one of UTF-16 based coding systems. */
/* True if the 16-bit code unit VAL is a high (leading) surrogate: the
   mask 0xFC00 ignores the low 10 bits, so this matches 0xD800..0xDBFF.
   VAL is evaluated once.  */
1566 #define UTF_16_HIGH_SURROGATE_P(val) \
1567 (((val) & 0xFC00) == 0xD800)
/* True if the 16-bit code unit VAL is a low (trailing) surrogate: with
   the low 10 bits ignored, this matches 0xDC00..0xDFFF.  VAL is
   evaluated once.  */
1569 #define UTF_16_LOW_SURROGATE_P(val) \
1570 (((val) & 0xFC00) == 0xDC00)
1574 detect_coding_utf_16 (struct coding_system
*coding
,
1575 struct coding_detection_info
*detect_info
)
1577 const unsigned char *src
= coding
->source
;
1578 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1579 bool multibytep
= coding
->src_multibyte
;
1582 detect_info
->checked
|= CATEGORY_MASK_UTF_16
;
1583 if (coding
->mode
& CODING_MODE_LAST_BLOCK
1584 && (coding
->src_chars
& 1))
1586 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1590 TWO_MORE_BYTES (c1
, c2
);
1591 if ((c1
== 0xFF) && (c2
== 0xFE))
1593 detect_info
->found
|= (CATEGORY_MASK_UTF_16_LE
1594 | CATEGORY_MASK_UTF_16_AUTO
);
1595 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_BE
1596 | CATEGORY_MASK_UTF_16_BE_NOSIG
1597 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1599 else if ((c1
== 0xFE) && (c2
== 0xFF))
1601 detect_info
->found
|= (CATEGORY_MASK_UTF_16_BE
1602 | CATEGORY_MASK_UTF_16_AUTO
);
1603 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_LE
1604 | CATEGORY_MASK_UTF_16_BE_NOSIG
1605 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1609 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1614 /* We check the dispersion of the Eth and Oth bytes, where E is even and
1615 O is odd. If both are high, we assume binary data. */
1616 unsigned char e
[256], o
[256];
1617 unsigned e_num
= 1, o_num
= 1;
1624 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_AUTO
1625 |CATEGORY_MASK_UTF_16_BE
1626 | CATEGORY_MASK_UTF_16_LE
);
1628 while ((detect_info
->rejected
& CATEGORY_MASK_UTF_16
)
1629 != CATEGORY_MASK_UTF_16
)
1631 TWO_MORE_BYTES (c1
, c2
);
1639 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_BE_NOSIG
;
1646 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_LE_NOSIG
;
1657 decode_coding_utf_16 (struct coding_system
*coding
)
1659 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1660 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1661 const unsigned char *src_base
;
1662 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
1663 /* We may produce at most 3 chars in one loop. */
1664 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
- 2;
1665 ptrdiff_t consumed_chars
= 0, consumed_chars_base
= 0;
1666 bool multibytep
= coding
->src_multibyte
;
1667 enum utf_bom_type bom
= CODING_UTF_16_BOM (coding
);
1668 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1669 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1671 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
1672 int byte_after_cr1
= -1, byte_after_cr2
= -1;
1674 if (bom
== utf_with_bom
)
1683 if (endian
== utf_16_big_endian
1684 ? c
!= 0xFEFF : c
!= 0xFFFE)
1686 /* The first two bytes are not BOM. Treat them as bytes
1687 for a normal character. */
1691 CODING_UTF_16_BOM (coding
) = utf_without_bom
;
1693 else if (bom
== utf_detect_bom
)
1695 /* We have already tried to detect BOM and failed in
1697 CODING_UTF_16_BOM (coding
) = utf_without_bom
;
1705 consumed_chars_base
= consumed_chars
;
1707 if (charbuf
>= charbuf_end
)
1709 if (byte_after_cr1
>= 0)
1714 if (byte_after_cr1
>= 0)
1715 c1
= byte_after_cr1
, byte_after_cr1
= -1;
1723 if (byte_after_cr2
>= 0)
1724 c2
= byte_after_cr2
, byte_after_cr2
= -1;
1729 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
1733 c
= (endian
== utf_16_big_endian
1734 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1738 if (! UTF_16_LOW_SURROGATE_P (c
))
1740 if (endian
== utf_16_big_endian
)
1741 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1743 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1747 if (UTF_16_HIGH_SURROGATE_P (c
))
1748 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1754 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1755 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1756 *charbuf
++ = 0x10000 + c
;
1761 if (UTF_16_HIGH_SURROGATE_P (c
))
1762 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1765 if (eol_dos
&& c
== '\r')
1767 ONE_MORE_BYTE (byte_after_cr1
);
1768 ONE_MORE_BYTE (byte_after_cr2
);
1776 coding
->consumed_char
+= consumed_chars_base
;
1777 coding
->consumed
= src_base
- coding
->source
;
1778 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1782 encode_coding_utf_16 (struct coding_system
*coding
)
1784 bool multibytep
= coding
->dst_multibyte
;
1785 int *charbuf
= coding
->charbuf
;
1786 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1787 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1788 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1790 enum utf_bom_type bom
= CODING_UTF_16_BOM (coding
);
1791 bool big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1792 ptrdiff_t produced_chars
= 0;
1795 if (bom
!= utf_without_bom
)
1797 ASSURE_DESTINATION (safe_room
);
1799 EMIT_TWO_BYTES (0xFE, 0xFF);
1801 EMIT_TWO_BYTES (0xFF, 0xFE);
1802 CODING_UTF_16_BOM (coding
) = utf_without_bom
;
1805 while (charbuf
< charbuf_end
)
1807 ASSURE_DESTINATION (safe_room
);
1809 if (c
> MAX_UNICODE_CHAR
)
1810 c
= coding
->default_char
;
1815 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1817 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1824 c1
= (c
>> 10) + 0xD800;
1825 c2
= (c
& 0x3FF) + 0xDC00;
1827 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1829 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1832 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1833 coding
->produced
= dst
- coding
->destination
;
1834 coding
->produced_char
+= produced_chars
;
1839 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1841 /* Emacs' internal format for representation of multiple character
1842 sets is a kind of multi-byte encoding, i.e. characters are
1843 represented by variable-length sequences of one-byte codes.
1845 ASCII characters and control characters (e.g. `tab', `newline') are
1846 represented by one-byte sequences which are their ASCII codes, in
1847 the range 0x00 through 0x7F.
1849 8-bit characters of the range 0x80..0x9F are represented by
1850 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1853 8-bit characters of the range 0xA0..0xFF are represented by
1854 one-byte sequences which are their 8-bit code.
1856 The other characters are represented by a sequence of `base
1857 leading-code', optional `extended leading-code', and one or two
1858 `position-code's. The length of the sequence is determined by the
1859 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1860 whereas extended leading-code and position-code take the range 0xA0
1861 through 0xFF. See `charset.h' for more details about leading-code
1864 --- CODE RANGE of Emacs' internal format ---
1868 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1869 eight-bit-graphic 0xA0..0xFF
1870 ELSE 0x81..0x9D + [0xA0..0xFF]+
1871 ---------------------------------------------
1873 As this is the internal character representation, the format is
1874 usually not used externally (i.e. in a file or in data sent to a
1875 process). But it is possible to have text externally in this
1876 format (i.e. by encoding with the coding system `emacs-mule').
1878 In that case, a sequence of one-byte codes has a slightly different
1881 First, all characters in eight-bit-control are represented by
1882 one-byte sequences which are their 8-bit code.
1884 Next, character composition data are represented by the byte
1885 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1887 METHOD is 0xF2 plus one of the composition methods (enum
1888 composition_method),
1890 BYTES is 0xA0 plus the byte length of this composition data,
1892 CHARS is 0xA0 plus the number of characters composed by this
1895 COMPONENTs are characters in multibyte form or composition
1896 rules encoded as two bytes of ASCII codes.
1898 In addition, for backward compatibility, the following formats are
1899 also recognized as composition data on decoding.
1902 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1905 MSEQ is a multibyte form but in one of these special formats:
1906 ASCII: 0xA0 ASCII_CODE+0x80,
1907 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1908 RULE is a one-byte code in the range 0xA0..0xF0 that
1909 represents a composition rule.
/* Table with one entry per possible byte value.
   detect_coding_emacs_mule uses emacs_mule_bytes[C] - 1 as the number
   of further bytes to read after byte C, and emacs_mule_char switches
   on emacs_mule_bytes[C]; so entry C appears to hold the total byte
   length of a sequence that starts with C.  The code that fills in the
   table is not part of this section.  */
1912 char emacs_mule_bytes
[256];
1915 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1916 Return true if a text is encoded in 'emacs-mule'. */
1919 detect_coding_emacs_mule (struct coding_system
*coding
,
1920 struct coding_detection_info
*detect_info
)
1922 const unsigned char *src
= coding
->source
, *src_base
;
1923 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1924 bool multibytep
= coding
->src_multibyte
;
1925 ptrdiff_t consumed_chars
= 0;
1929 detect_info
->checked
|= CATEGORY_MASK_EMACS_MULE
;
1930 /* A coding system of this category is always ASCII compatible. */
1931 src
+= coding
->head_ascii
;
1941 /* Perhaps the start of a composite character. We simply skip
1942 it because analyzing it is too heavy for detecting. But,
1943 at least, we check that the composite character
1944 consists of more than 4 bytes. */
1945 const unsigned char *src_start
;
1955 if (src
- src_start
<= 4)
1957 found
= CATEGORY_MASK_EMACS_MULE
;
1965 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1970 int more_bytes
= emacs_mule_bytes
[c
] - 1;
1972 while (more_bytes
> 0)
1977 src
--; /* Unread the last byte. */
1982 if (more_bytes
!= 0)
1984 found
= CATEGORY_MASK_EMACS_MULE
;
1987 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1991 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1993 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1996 detect_info
->found
|= found
;
2001 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2002 character. If CMP_STATUS indicates that we must expect MSEQ or
2003 RULE described above, decode it and return the negative value of
2004 the decoded character or rule. If an invalid byte is found, return
2005 -1. If SRC is too short, return -2. */
2008 emacs_mule_char (struct coding_system
*coding
, const unsigned char *src
,
2009 int *nbytes
, int *nchars
, int *id
,
2010 struct composition_status
*cmp_status
)
2012 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2013 const unsigned char *src_base
= src
;
2014 bool multibytep
= coding
->src_multibyte
;
2018 ptrdiff_t consumed_chars
= 0;
2019 bool mseq_found
= 0;
2025 charset_ID
= emacs_mule_charset
[0];
2031 if (cmp_status
->state
!= COMPOSING_NO
2032 && cmp_status
->old_form
)
2034 if (cmp_status
->state
== COMPOSING_CHAR
)
2049 *nbytes
= src
- src_base
;
2050 *nchars
= consumed_chars
;
2058 switch (emacs_mule_bytes
[c
])
2061 if ((charset_ID
= emacs_mule_charset
[c
]) < 0)
2070 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
2071 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
2074 if (c
< 0xA0 || (charset_ID
= emacs_mule_charset
[c
]) < 0)
2083 if ((charset_ID
= emacs_mule_charset
[c
]) < 0)
2088 code
= (c
& 0x7F) << 8;
2098 if (c
< 0 || (charset_ID
= emacs_mule_charset
[c
]) < 0)
2103 code
= (c
& 0x7F) << 8;
2112 charset_ID
= ASCII_BYTE_P (code
) ? charset_ascii
: charset_eight_bit
;
2118 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
2119 CHARSET_FROM_ID (charset_ID
), code
, c
);
2123 *nbytes
= src
- src_base
;
2124 *nchars
= consumed_chars
;
2127 return (mseq_found
? -c
: c
);
2137 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2139 /* Handle these composition sequences ('|': the end of header elements,
2140 BYTES and CHARS >= 0xA0):
2142 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2143 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2144 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2148 (4) relative composition: 0x80 | MSEQ ... MSEQ
2149 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2151 When the starter 0x80 and the following header elements are found,
2152 this annotation header is produced.
2154 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2156 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2157 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2159 Then, upon reading the following elements, these codes are produced
2160 until the composition end is found:
2163 (2) ALT ... ALT CHAR ... CHAR
2164 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2166 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2168 When the composition end is found, LENGTH and NCHARS in the
2169 annotation header are updated as below:
2171 (1) LENGTH: unchanged, NCHARS: unchanged
2172 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2173 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2174 (4) LENGTH: unchanged, NCHARS: number of CHARs
2175 (5) LENGTH: unchanged, NCHARS: number of CHARs
2177 If an error is found while composing, the annotation header is
2178 changed to the original composition header (plus filler -1s) as
2181 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2182 (5) [ 0x80 0xFF -1 -1 -1 ]
2184 and the sequence [ -2 DECODED-RULE ] is changed to the original
2185 byte sequence as below:
2186 o the original byte sequence is B: [ B -1 ]
2187 o the original byte sequence is B1 B2: [ B1 B2 ]
2189 Most of the routines are implemented by macros because many
2190 variables and labels in the caller decode_coding_emacs_mule must be
2191 accessible, and they are usually called just once (thus they don't
2192 increase the size of the compiled object). */
2194 /* Decode a composition rule represented by C as a component of a
2195 composition sequence of Emacs 20 style. Set RULE to the decoded
2198 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2203 if (c < 0 || c >= 81) \
2204 goto invalid_code; \
2205 gref = c / 9, nref = c % 9; \
2206 if (gref == 4) gref = 10; \
2207 if (nref == 4) nref = 10; \
2208 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2212 /* Decode a composition rule represented by C and the following byte
2213 at SRC as a component of a composition sequence of Emacs 21 style.
2214 Set RULE to the decoded rule. */
2216 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2221 if (gref < 0 || gref >= 81) \
2222 goto invalid_code; \
2223 ONE_MORE_BYTE (c); \
2225 if (nref < 0 || nref >= 81) \
2226 goto invalid_code; \
2227 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2231 /* Start of Emacs 21 style format. The first three bytes at SRC are
2232 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2233 byte length of this composition information, CHARS is the number of
2234 characters composed by this composition. */
2236 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2238 enum composition_method method = c - 0xF2; \
2239 int nbytes, nchars; \
2241 ONE_MORE_BYTE (c); \
2243 goto invalid_code; \
2244 nbytes = c - 0xA0; \
2245 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2246 goto invalid_code; \
2247 ONE_MORE_BYTE (c); \
2248 nchars = c - 0xA0; \
2249 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2250 goto invalid_code; \
2251 cmp_status->old_form = 0; \
2252 cmp_status->method = method; \
2253 if (method == COMPOSITION_RELATIVE) \
2254 cmp_status->state = COMPOSING_CHAR; \
2256 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2257 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2258 cmp_status->nchars = nchars; \
2259 cmp_status->ncomps = nbytes - 4; \
2260 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2264 /* Start of Emacs 20 style format for relative composition. */
2266 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2268 cmp_status->old_form = 1; \
2269 cmp_status->method = COMPOSITION_RELATIVE; \
2270 cmp_status->state = COMPOSING_CHAR; \
2271 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2272 cmp_status->nchars = cmp_status->ncomps = 0; \
2273 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2277 /* Start of Emacs 20 style format for rule-base composition. */
2279 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2281 cmp_status->old_form = 1; \
2282 cmp_status->method = COMPOSITION_WITH_RULE; \
2283 cmp_status->state = COMPOSING_CHAR; \
2284 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2285 cmp_status->nchars = cmp_status->ncomps = 0; \
2286 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2290 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2292 const unsigned char *current_src = src; \
2294 ONE_MORE_BYTE (c); \
2296 goto invalid_code; \
2297 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2298 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2299 DECODE_EMACS_MULE_21_COMPOSITION (); \
2300 else if (c < 0xA0) \
2301 goto invalid_code; \
2302 else if (c < 0xC0) \
2304 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2305 /* Re-read C as a composition component. */ \
2306 src = current_src; \
2308 else if (c == 0xFF) \
2309 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2311 goto invalid_code; \
2314 #define EMACS_MULE_COMPOSITION_END() \
2316 int idx = - cmp_status->length; \
2318 if (cmp_status->old_form) \
2319 charbuf[idx + 2] = cmp_status->nchars; \
2320 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2321 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2322 cmp_status->state = COMPOSING_NO; \
2327 emacs_mule_finish_composition (int *charbuf
,
2328 struct composition_status
*cmp_status
)
2330 int idx
= - cmp_status
->length
;
2333 if (cmp_status
->old_form
&& cmp_status
->nchars
> 0)
2335 charbuf
[idx
+ 2] = cmp_status
->nchars
;
2337 if (cmp_status
->method
== COMPOSITION_WITH_RULE
2338 && cmp_status
->state
== COMPOSING_CHAR
)
2340 /* The last rule was invalid. */
2341 int rule
= charbuf
[-1] + 0xA0;
2343 charbuf
[-2] = BYTE8_TO_CHAR (rule
);
2350 charbuf
[idx
++] = BYTE8_TO_CHAR (0x80);
2352 if (cmp_status
->method
== COMPOSITION_WITH_RULE
)
2354 charbuf
[idx
++] = BYTE8_TO_CHAR (0xFF);
2355 charbuf
[idx
++] = -3;
2361 int nchars
= charbuf
[idx
+ 1] + 0xA0;
2362 int nbytes
= charbuf
[idx
+ 2] + 0xA0;
2364 charbuf
[idx
++] = BYTE8_TO_CHAR (0xF2 + cmp_status
->method
);
2365 charbuf
[idx
++] = BYTE8_TO_CHAR (nbytes
);
2366 charbuf
[idx
++] = BYTE8_TO_CHAR (nchars
);
2367 charbuf
[idx
++] = -1;
2371 cmp_status
->state
= COMPOSING_NO
;
2375 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2377 if (cmp_status->state != COMPOSING_NO) \
2378 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2383 decode_coding_emacs_mule (struct coding_system
*coding
)
2385 const unsigned char *src
= coding
->source
+ coding
->consumed
;
2386 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2387 const unsigned char *src_base
;
2388 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
2389 /* We may produce two annotations (charset and composition) in one
2390 loop and one more charset annotation at the end. */
2392 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 3)
2393 /* We can produce up to 2 characters in a loop. */
2395 ptrdiff_t consumed_chars
= 0, consumed_chars_base
;
2396 bool multibytep
= coding
->src_multibyte
;
2397 ptrdiff_t char_offset
= coding
->produced_char
;
2398 ptrdiff_t last_offset
= char_offset
;
2399 int last_id
= charset_ascii
;
2401 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
2402 int byte_after_cr
= -1;
2403 struct composition_status
*cmp_status
= &coding
->spec
.emacs_mule
.cmp_status
;
2405 if (cmp_status
->state
!= COMPOSING_NO
)
2409 if (charbuf_end
- charbuf
< cmp_status
->length
)
2411 for (i
= 0; i
< cmp_status
->length
; i
++)
2412 *charbuf
++ = cmp_status
->carryover
[i
];
2413 coding
->annotated
= 1;
2418 int c
, id
IF_LINT (= 0);
2421 consumed_chars_base
= consumed_chars
;
2423 if (charbuf
>= charbuf_end
)
2425 if (byte_after_cr
>= 0)
2430 if (byte_after_cr
>= 0)
2431 c
= byte_after_cr
, byte_after_cr
= -1;
2435 if (c
< 0 || c
== 0x80)
2437 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2444 DECODE_EMACS_MULE_COMPOSITION_START ();
2450 if (eol_dos
&& c
== '\r')
2451 ONE_MORE_BYTE (byte_after_cr
);
2453 if (cmp_status
->state
!= COMPOSING_NO
)
2455 if (cmp_status
->old_form
)
2456 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2457 else if (cmp_status
->state
>= COMPOSING_COMPONENT_CHAR
)
2458 cmp_status
->ncomps
--;
2463 int nchars
IF_LINT (= 0), nbytes
IF_LINT (= 0);
2464 /* emacs_mule_char can load a charset map from a file, which
2465 allocates a large structure and might cause buffer text
2466 to be relocated as a result. Thus, we need to remember the
2467 original pointer to buffer text, and fix up all related
2468 pointers after the call. */
2469 const unsigned char *orig
= coding
->source
;
2472 c
= emacs_mule_char (coding
, src_base
, &nbytes
, &nchars
, &id
,
2474 offset
= coding
->source
- orig
;
2488 src
= src_base
+ nbytes
;
2489 consumed_chars
= consumed_chars_base
+ nchars
;
2490 if (cmp_status
->state
>= COMPOSING_COMPONENT_CHAR
)
2491 cmp_status
->ncomps
-= nchars
;
2494 /* Now if C >= 0, we found a normally encoded character, if C <
2495 0, we found an old-style composition component character or
2498 if (cmp_status
->state
== COMPOSING_NO
)
2502 if (last_id
!= charset_ascii
)
2503 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
,
2506 last_offset
= char_offset
;
2511 else if (cmp_status
->state
== COMPOSING_CHAR
)
2513 if (cmp_status
->old_form
)
2517 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2524 cmp_status
->nchars
++;
2525 cmp_status
->length
++;
2526 if (cmp_status
->nchars
== MAX_COMPOSITION_COMPONENTS
)
2527 EMACS_MULE_COMPOSITION_END ();
2528 else if (cmp_status
->method
== COMPOSITION_WITH_RULE
)
2529 cmp_status
->state
= COMPOSING_RULE
;
2535 cmp_status
->length
++;
2536 cmp_status
->nchars
--;
2537 if (cmp_status
->nchars
== 0)
2538 EMACS_MULE_COMPOSITION_END ();
2541 else if (cmp_status
->state
== COMPOSING_RULE
)
2547 EMACS_MULE_COMPOSITION_END ();
2554 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c
, rule
);
2559 cmp_status
->length
+= 2;
2560 cmp_status
->state
= COMPOSING_CHAR
;
2563 else if (cmp_status
->state
== COMPOSING_COMPONENT_CHAR
)
2566 cmp_status
->length
++;
2567 if (cmp_status
->ncomps
== 0)
2568 cmp_status
->state
= COMPOSING_CHAR
;
2569 else if (cmp_status
->ncomps
> 0)
2571 if (cmp_status
->method
== COMPOSITION_WITH_RULE_ALTCHARS
)
2572 cmp_status
->state
= COMPOSING_COMPONENT_RULE
;
2575 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2577 else /* COMPOSING_COMPONENT_RULE */
2581 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c
, rule
);
2586 cmp_status
->length
+= 2;
2587 cmp_status
->ncomps
--;
2588 if (cmp_status
->ncomps
> 0)
2589 cmp_status
->state
= COMPOSING_COMPONENT_CHAR
;
2591 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2596 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2598 consumed_chars
= consumed_chars_base
;
2600 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
2606 if (cmp_status
->state
!= COMPOSING_NO
)
2608 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
2609 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2614 charbuf
-= cmp_status
->length
;
2615 for (i
= 0; i
< cmp_status
->length
; i
++)
2616 cmp_status
->carryover
[i
] = charbuf
[i
];
2619 if (last_id
!= charset_ascii
)
2620 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
2621 coding
->consumed_char
+= consumed_chars_base
;
2622 coding
->consumed
= src_base
- coding
->source
;
2623 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
/* Fill CODES[0] and CODES[1] with the emacs-mule leading code(s) for
   the charset whose emacs-mule id is ID.  One branch stores ID itself
   in CODES[0] and 0 in CODES[1]; the remaining branches store ID in
   CODES[1] and put 0x9A (ID < 0xE0), 0x9B (ID < 0xF0), 0x9C
   (ID < 0xF5) or, in the final branch, 0x9D in CODES[0].
   encode_coding_emacs_mule passes a two-byte array, always emits
   CODES[0], and emits CODES[1] only when it is nonzero.
   ID and CODES are expanded without parentheses and ID is expanded
   several times, so pass plain side-effect-free expressions.  */
2627 #define EMACS_MULE_LEADING_CODES(id, codes) \
2630 codes[0] = id, codes[1] = 0; \
2631 else if (id < 0xE0) \
2632 codes[0] = 0x9A, codes[1] = id; \
2633 else if (id < 0xF0) \
2634 codes[0] = 0x9B, codes[1] = id; \
2635 else if (id < 0xF5) \
2636 codes[0] = 0x9C, codes[1] = id; \
2638 codes[0] = 0x9D, codes[1] = id; \
2643 encode_coding_emacs_mule (struct coding_system
*coding
)
2645 bool multibytep
= coding
->dst_multibyte
;
2646 int *charbuf
= coding
->charbuf
;
2647 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
2648 unsigned char *dst
= coding
->destination
+ coding
->produced
;
2649 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
2651 ptrdiff_t produced_chars
= 0;
2652 Lisp_Object attrs
, charset_list
;
2654 int preferred_charset_id
= -1;
2656 CODING_GET_INFO (coding
, attrs
, charset_list
);
2657 if (! EQ (charset_list
, Vemacs_mule_charset_list
))
2659 charset_list
= Vemacs_mule_charset_list
;
2660 ASET (attrs
, coding_attr_charset_list
, charset_list
);
2663 while (charbuf
< charbuf_end
)
2665 ASSURE_DESTINATION (safe_room
);
2670 /* Handle an annotation. */
2673 case CODING_ANNOTATE_COMPOSITION_MASK
:
2674 /* Not yet implemented. */
2676 case CODING_ANNOTATE_CHARSET_MASK
:
2677 preferred_charset_id
= charbuf
[3];
2678 if (preferred_charset_id
>= 0
2679 && NILP (Fmemq (make_number (preferred_charset_id
),
2681 preferred_charset_id
= -1;
2690 if (ASCII_CHAR_P (c
))
2691 EMIT_ONE_ASCII_BYTE (c
);
2692 else if (CHAR_BYTE8_P (c
))
2694 c
= CHAR_TO_BYTE8 (c
);
2699 struct charset
*charset
;
2703 unsigned char leading_codes
[2];
2705 if (preferred_charset_id
>= 0)
2709 charset
= CHARSET_FROM_ID (preferred_charset_id
);
2710 CODING_CHAR_CHARSET_P (coding
, dst
, dst_end
, c
, charset
, result
);
2712 code
= ENCODE_CHAR (charset
, c
);
2714 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
2718 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
2722 c
= coding
->default_char
;
2723 if (ASCII_CHAR_P (c
))
2725 EMIT_ONE_ASCII_BYTE (c
);
2728 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
2731 dimension
= CHARSET_DIMENSION (charset
);
2732 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2733 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2734 EMIT_ONE_BYTE (leading_codes
[0]);
2735 if (leading_codes
[1])
2736 EMIT_ONE_BYTE (leading_codes
[1]);
2738 EMIT_ONE_BYTE (code
| 0x80);
2742 EMIT_ONE_BYTE (code
>> 8);
2743 EMIT_ONE_BYTE (code
& 0xFF);
2747 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
2748 coding
->produced_char
+= produced_chars
;
2749 coding
->produced
= dst
- coding
->destination
;
2754 /*** 7. ISO2022 handlers ***/
2756 /* The following note describes the coding system ISO2022 briefly.
2757 Since the intention of this note is to help understand the
2758 functions in this file, some parts are NOT ACCURATE or are OVERLY
2759 SIMPLIFIED. For thorough understanding, please refer to the
2760 original document of ISO2022. This is equivalent to the standard
2761 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2763 ISO2022 provides many mechanisms to encode several character sets
2764 in 7-bit and 8-bit environments. For 7-bit environments, all text
2765 is encoded using bytes less than 128. This may make the encoded
2766 text a little bit longer, but the text passes more easily through
2767 several types of gateway, some of which strip off the MSB (Most Significant Bit).
2770 There are two kinds of character sets: control character sets and
2771 graphic character sets. The former contain control characters such
2772 as `newline' and `escape' to provide control functions (control
2773 functions are also provided by escape sequences). The latter
2774 contain graphic characters such as 'A' and '-'. Emacs recognizes
2775 two control character sets and many graphic character sets.
2777 Graphic character sets are classified into one of the following
2778 four classes, according to the number of bytes (DIMENSION) and
2779 number of characters in one dimension (CHARS) of the set:
2780 - DIMENSION1_CHARS94
2781 - DIMENSION1_CHARS96
2782 - DIMENSION2_CHARS94
2783 - DIMENSION2_CHARS96
2785 In addition, each character set is assigned an identification tag,
2786 unique for each set, called the "final character" (denoted as <F>
2787 hereafter). The <F> of each character set is decided by ECMA(*)
2788 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2789 (0x30..0x3F are for private use only).
2791 Note (*): ECMA = European Computer Manufacturers Association
2793 Here are examples of graphic character sets [NAME(<F>)]:
2794 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2795 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2796 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2797 o DIMENSION2_CHARS96 -- none for the moment
2799 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2800 C0 [0x00..0x1F] -- control character plane 0
2801 GL [0x20..0x7F] -- graphic character plane 0
2802 C1 [0x80..0x9F] -- control character plane 1
2803 GR [0xA0..0xFF] -- graphic character plane 1
2805 A control character set is directly designated and invoked to C0 or
2806 C1 by an escape sequence. The most common case is that:
2807 - ISO646's control character set is designated/invoked to C0, and
2808 - ISO6429's control character set is designated/invoked to C1,
2809 and usually these designations/invocations are omitted in encoded
2810 text. In a 7-bit environment, only C0 can be used, and a control
2811 character for C1 is encoded by an appropriate escape sequence to
2812 fit into the environment. All control characters for C1 are
2813 defined to have corresponding escape sequences.
2815 A graphic character set is at first designated to one of four
2816 graphic registers (G0 through G3), then these graphic registers are
2817 invoked to GL or GR. These designations and invocations can be
2818 done independently. The most common case is that G0 is invoked to
2819 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2820 these invocations and designations are omitted in encoded text.
2821 In a 7-bit environment, only GL can be used.
2823 When a graphic character set of CHARS94 is invoked to GL, codes
2824 0x20 and 0x7F of the GL area work as control characters SPACE and
2825 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not be used.
2828 There are two ways of invocation: locking-shift and single-shift.
2829 With locking-shift, the invocation lasts until the next different
2830 invocation, whereas with single-shift, the invocation affects the
2831 following character only and doesn't affect the locking-shift
2832 state. Invocations are done by the following control characters or escape sequences:
2835 ----------------------------------------------------------------------
2836 abbrev function cntrl escape seq description
2837 ----------------------------------------------------------------------
2838 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2839 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2840 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2841 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2842 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2843 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2844 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2845 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2846 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2847 ----------------------------------------------------------------------
2848 (*) These are not used by any known coding system.
2850 Control characters for these functions are defined by macros
2851 ISO_CODE_XXX in `coding.h'.
2853 Designations are done by the following escape sequences:
2854 ----------------------------------------------------------------------
2855 escape sequence description
2856 ----------------------------------------------------------------------
2857 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2858 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2859 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2860 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2861 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2862 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2863 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2864 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2865 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2866 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2867 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2868 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2869 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2870 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2871 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2872 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2873 ----------------------------------------------------------------------
2875 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2876 of dimension 1, chars 94, and final character <F>, etc...
2878 Note (*): Although these designations are not allowed in ISO2022,
2879 Emacs accepts them on decoding, and produces them on encoding
2880 CHARS96 character sets in a coding system which is characterized as
2881 7-bit environment, non-locking-shift, and non-single-shift.
2883 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2884 '(' must be omitted. We refer to this as "short-form" hereafter.
2886 Now you may notice that there are a lot of ways of encoding the
2887 same multilingual text in ISO2022. Actually, there exist many
2888 coding systems such as Compound Text (used in X11's inter client
2889 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2890 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2891 localized platforms), and all of these are variants of ISO2022.
2893 In addition to the above, Emacs handles two more kinds of escape
2894 sequences: ISO6429's direction specification and Emacs' private
2895 sequence for specifying character composition.
2897 ISO6429's direction specification takes the following form:
2898 o CSI ']' -- end of the current direction
2899 o CSI '0' ']' -- end of the current direction
2900 o CSI '1' ']' -- start of left-to-right text
2901 o CSI '2' ']' -- start of right-to-left text
2902 The control character CSI (0x9B: control sequence introducer) is
2903 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2905 Character composition specification takes the following form:
2906 o ESC '0' -- start relative composition
2907 o ESC '1' -- end composition
2908 o ESC '2' -- start rule-base composition (*)
2909 o ESC '3' -- start relative composition with alternate chars (**)
2910 o ESC '4' -- start rule-base composition with alternate chars (**)
2911 Since these are not standard escape sequences of any ISO standard,
2912 the use of them with these meanings is restricted to Emacs only.
2914 (*) This form is used only in Emacs 20.7 and older versions,
2915 but newer versions can safely decode it.
2916 (**) This form is used only in Emacs 21.1 and newer versions,
2917 and older versions can't decode it.
2919 Here's a list of example usages of these composition escape
2920 sequences (categorized by `enum composition_method').
2922 COMPOSITION_RELATIVE:
2923 ESC 0 CHAR [ CHAR ] ESC 1
2924 COMPOSITION_WITH_RULE:
2925 ESC 2 CHAR [ RULE CHAR ] ESC 1
2926 COMPOSITION_WITH_ALTCHARS:
2927 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2928 COMPOSITION_WITH_RULE_ALTCHARS:
2929 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
/* Table giving the ISO-2022 code class of each byte value; it has 256
   entries and is indexed by the byte itself.  decode_coding_iso_2022
   switches on iso_code_class[c1] to decide how to treat a source byte
   (cases such as ISO_0x20_or_0x7F, ISO_graphic_plane_0,
   ISO_single_shift_2, ISO_control_sequence_introducer).
   NOTE(review): there is no initializer here, so the table is
   presumably filled in elsewhere before first use -- confirm.  */
2931 static enum iso_code_class_type iso_code_class
[256];
/* True if charset ID may be used with CODING: ID must be a valid
   index into CODING's safe_charsets table (0 .. max_charset_id) and
   the entry there must not be 255.  255 is the value
   setup_iso_safe_charsets fills the table with before it records a
   register number for each charset in the coding system's list, so it
   marks charsets outside that list.  A negative ID yields false
   rather than being used as an index.  ID is evaluated more than
   once, so it must have no side effects.  */
#define SAFE_CHARSET_P(coding, id)	\
  ((id) >= 0				\
   && (id) <= (coding)->max_charset_id	\
   && (coding)->safe_charsets[(id)] != 255)
2938 setup_iso_safe_charsets (Lisp_Object attrs
)
2940 Lisp_Object charset_list
, safe_charsets
;
2941 Lisp_Object request
;
2942 Lisp_Object reg_usage
;
2944 EMACS_INT reg94
, reg96
;
2945 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2948 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2949 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2950 && ! EQ (charset_list
, Viso_2022_charset_list
))
2952 charset_list
= Viso_2022_charset_list
;
2953 ASET (attrs
, coding_attr_charset_list
, charset_list
);
2954 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2957 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2961 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2963 int id
= XINT (XCAR (tail
));
2964 if (max_charset_id
< id
)
2965 max_charset_id
= id
;
2968 safe_charsets
= make_uninit_string (max_charset_id
+ 1);
2969 memset (SDATA (safe_charsets
), 255, max_charset_id
+ 1);
2970 request
= AREF (attrs
, coding_attr_iso_request
);
2971 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2972 reg94
= XINT (XCAR (reg_usage
));
2973 reg96
= XINT (XCDR (reg_usage
));
2975 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2979 struct charset
*charset
;
2982 charset
= CHARSET_FROM_ID (XINT (id
));
2983 reg
= Fcdr (Fassq (id
, request
));
2985 SSET (safe_charsets
, XINT (id
), XINT (reg
));
2986 else if (charset
->iso_chars_96
)
2989 SSET (safe_charsets
, XINT (id
), reg96
);
2994 SSET (safe_charsets
, XINT (id
), reg94
);
2997 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
3001 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3002 Return true if a text is encoded in one of ISO-2022 based coding
3006 detect_coding_iso_2022 (struct coding_system
*coding
,
3007 struct coding_detection_info
*detect_info
)
3009 const unsigned char *src
= coding
->source
, *src_base
= src
;
3010 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3011 bool multibytep
= coding
->src_multibyte
;
3012 bool single_shifting
= 0;
3015 ptrdiff_t consumed_chars
= 0;
3019 int composition_count
= -1;
3021 detect_info
->checked
|= CATEGORY_MASK_ISO
;
3023 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
3025 struct coding_system
*this = &(coding_categories
[i
]);
3026 Lisp_Object attrs
, val
;
3030 attrs
= CODING_ID_ATTRS (this->id
);
3031 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3032 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Viso_2022_charset_list
))
3033 setup_iso_safe_charsets (attrs
);
3034 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
3035 this->max_charset_id
= SCHARS (val
) - 1;
3036 this->safe_charsets
= SDATA (val
);
3039 /* A coding system of this category is always ASCII compatible. */
3040 src
+= coding
->head_ascii
;
3042 while (rejected
!= CATEGORY_MASK_ISO
)
3049 if (inhibit_iso_escape_detection
)
3051 single_shifting
= 0;
3053 if (c
== 'N' || c
== 'O')
3055 /* ESC <Fe> for SS2 or SS3. */
3056 single_shifting
= 1;
3057 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
3061 /* End of composition. */
3062 if (composition_count
< 0
3063 || composition_count
> MAX_COMPOSITION_COMPONENTS
)
3066 composition_count
= -1;
3067 found
|= CATEGORY_MASK_ISO
;
3069 else if (c
>= '0' && c
<= '4')
3071 /* ESC <Fp> for start/end composition. */
3072 composition_count
= 0;
3076 if (c
>= '(' && c
<= '/')
3078 /* Designation sequence for a charset of dimension 1. */
3080 if (c1
< ' ' || c1
>= 0x80
3081 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
3083 /* Invalid designation sequence. Just ignore. */
3085 rejected
|= (CATEGORY_MASK_ISO_7BIT
3086 | CATEGORY_MASK_ISO_7_ELSE
);
3092 /* Designation sequence for a charset of dimension 2. */
3094 if (c
>= '@' && c
<= 'B')
3095 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3096 id
= iso_charset_table
[1][0][c
];
3097 else if (c
>= '(' && c
<= '/')
3100 if (c1
< ' ' || c1
>= 0x80
3101 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
3103 /* Invalid designation sequence. Just ignore. */
3105 rejected
|= (CATEGORY_MASK_ISO_7BIT
3106 | CATEGORY_MASK_ISO_7_ELSE
);
3112 /* Invalid designation sequence. Just ignore it. */
3114 rejected
|= (CATEGORY_MASK_ISO_7BIT
3115 | CATEGORY_MASK_ISO_7_ELSE
);
3121 /* Invalid escape sequence. Just ignore it. */
3123 rejected
|= (CATEGORY_MASK_ISO_7BIT
3124 | CATEGORY_MASK_ISO_7_ELSE
);
3128 /* We found a valid designation sequence for CHARSET. */
3129 rejected
|= CATEGORY_MASK_ISO_8BIT
;
3130 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
3132 found
|= CATEGORY_MASK_ISO_7
;
3134 rejected
|= CATEGORY_MASK_ISO_7
;
3135 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
3137 found
|= CATEGORY_MASK_ISO_7_TIGHT
;
3139 rejected
|= CATEGORY_MASK_ISO_7_TIGHT
;
3140 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
3142 found
|= CATEGORY_MASK_ISO_7_ELSE
;
3144 rejected
|= CATEGORY_MASK_ISO_7_ELSE
;
3145 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
3147 found
|= CATEGORY_MASK_ISO_8_ELSE
;
3149 rejected
|= CATEGORY_MASK_ISO_8_ELSE
;
3155 /* Locking shift out/in. */
3156 if (inhibit_iso_escape_detection
)
3158 single_shifting
= 0;
3159 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
3163 /* Control sequence introducer. */
3164 single_shifting
= 0;
3165 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
3166 found
|= CATEGORY_MASK_ISO_8_ELSE
;
3167 goto check_extra_latin
;
3172 if (inhibit_iso_escape_detection
)
3174 single_shifting
= 0;
3175 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
3176 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
3177 & CODING_ISO_FLAG_SINGLE_SHIFT
)
3179 found
|= CATEGORY_MASK_ISO_8_1
;
3180 single_shifting
= 1;
3182 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
3183 & CODING_ISO_FLAG_SINGLE_SHIFT
)
3185 found
|= CATEGORY_MASK_ISO_8_2
;
3186 single_shifting
= 1;
3188 if (single_shifting
)
3190 goto check_extra_latin
;
3197 if (composition_count
>= 0)
3198 composition_count
++;
3199 single_shifting
= 0;
3202 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
3205 found
|= CATEGORY_MASK_ISO_8_1
;
3206 /* Check the length of succeeding codes of the range
3207 0xA0..0xFF. If the byte length is even, we include
3208 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3209 only when we are not single shifting. */
3210 if (! single_shifting
3211 && ! (rejected
& CATEGORY_MASK_ISO_8_2
))
3214 while (src
< src_end
)
3226 if (len
& 1 && src
< src_end
)
3228 rejected
|= CATEGORY_MASK_ISO_8_2
;
3229 if (composition_count
>= 0)
3230 composition_count
+= len
;
3234 found
|= CATEGORY_MASK_ISO_8_2
;
3235 if (composition_count
>= 0)
3236 composition_count
+= len
/ 2;
3242 if (! VECTORP (Vlatin_extra_code_table
)
3243 || NILP (AREF (Vlatin_extra_code_table
, c
)))
3245 rejected
= CATEGORY_MASK_ISO
;
3248 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
3249 & CODING_ISO_FLAG_LATIN_EXTRA
)
3250 found
|= CATEGORY_MASK_ISO_8_1
;
3252 rejected
|= CATEGORY_MASK_ISO_8_1
;
3253 rejected
|= CATEGORY_MASK_ISO_8_2
;
3257 detect_info
->rejected
|= CATEGORY_MASK_ISO
;
3261 detect_info
->rejected
|= rejected
;
3262 detect_info
->found
|= (found
& ~rejected
);
3267 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3268 escape sequence should be kept. */
3269 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3273 if (final < '0' || final >= 128 \
3274 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3275 || !SAFE_CHARSET_P (coding, id)) \
3277 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3281 prev = CODING_ISO_DESIGNATION (coding, reg); \
3282 if (id == charset_jisx0201_roman) \
3284 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3285 id = charset_ascii; \
3287 else if (id == charset_jisx0208_1978) \
3289 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3290 id = charset_jisx0208; \
3292 CODING_ISO_DESIGNATION (coding, reg) = id; \
3293 /* If there was an invalid designation to REG previously, and this \
3294 designation is ASCII to REG, we should keep this designation \
3296 if (prev == -2 && id == charset_ascii) \
3301 /* Handle these composition sequences (ALT: alternate char):
3303 (1) relative composition: ESC 0 CHAR ... ESC 1
3304 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3305 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3306 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3308 When the start sequence (ESC 0/2/3/4) is found, this annotation
3311 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3313 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3314 produced until the end sequence (ESC 1) is found:
3317 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3318 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3319 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3321 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3322 annotation header are updated as below:
3324 (1) LENGTH: unchanged, NCHARS: number of CHARs
3325 (2) LENGTH: unchanged, NCHARS: number of CHARs
3326 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3327 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3329 If an error is found while composing, the annotation header is
3332 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3334 and the sequence [ -2 DECODED-RULE ] is changed to the original
3335 byte sequence as below:
3336 o the original byte sequence is B: [ B -1 ]
3337 o the original byte sequence is B1 B2: [ B1 B2 ]
3338 and the sequence [ -1 -1 ] is changed to the original byte
3343 /* Decode a composition rule C1 and maybe one more byte from the
3344 source, and set RULE to the encoded composition rule. If the rule
3345 is invalid, goto invalid_code. */
3347 #define DECODE_COMPOSITION_RULE(rule) \
3351 goto invalid_code; \
3352 if (rule < 81) /* old format (before ver.21) */ \
3354 int gref = (rule) / 9; \
3355 int nref = (rule) % 9; \
3356 if (gref == 4) gref = 10; \
3357 if (nref == 4) nref = 10; \
3358 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3360 else /* new format (after ver.21) */ \
3364 ONE_MORE_BYTE (b); \
3365 if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32)) \
3366 goto invalid_code; \
3367 rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32); \
3368 rule += 0x100; /* Distinguish it from the old format. */ \
/* Turn the encoded composition rule RULE back into rule codes, stored
   in charbuf[idx] and charbuf[idx + 1]; `charbuf' and `idx' come from
   the scope in which the macro is expanded.  The low part of RULE
   (RULE % 0x100) is split into GREF (quotient by 12) and NREF
   (remainder by 12).  If RULE is below 0x100 (old format), a GREF or
   NREF of 10 is mapped to 4 -- the reverse of the 4 -> 10 mapping in
   DECODE_COMPOSITION_RULE -- and the single code
   32 + GREF * 9 + NREF is stored, followed by -1.  Otherwise (new
   format) the two codes 32 + 81 + GREF and 32 + NREF are stored.
   RULE is expanded without parentheses and more than once.  */
3372 #define ENCODE_COMPOSITION_RULE(rule) \
3374 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3376 if (rule < 0x100) /* old format */ \
3378 if (gref == 10) gref = 4; \
3379 if (nref == 10) nref = 4; \
3380 charbuf[idx] = 32 + gref * 9 + nref; \
3381 charbuf[idx + 1] = -1; \
3384 else /* new format */ \
3386 charbuf[idx] = 32 + 81 + gref; \
3387 charbuf[idx + 1] = 32 + nref; \
3392 /* Finish the current composition as invalid. */
3395 finish_composition (int *charbuf
, struct composition_status
*cmp_status
)
3397 int idx
= - cmp_status
->length
;
3400 /* Recover the original ESC sequence */
3401 charbuf
[idx
++] = ISO_CODE_ESC
;
3402 charbuf
[idx
++] = (cmp_status
->method
== COMPOSITION_RELATIVE
? '0'
3403 : cmp_status
->method
== COMPOSITION_WITH_RULE
? '2'
3404 : cmp_status
->method
== COMPOSITION_WITH_ALTCHARS
? '3'
3405 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3407 charbuf
[idx
++] = -2;
3409 charbuf
[idx
++] = -1;
3410 new_chars
= cmp_status
->nchars
;
3411 if (cmp_status
->method
>= COMPOSITION_WITH_RULE
)
3412 for (; idx
< 0; idx
++)
3414 int elt
= charbuf
[idx
];
3418 ENCODE_COMPOSITION_RULE (charbuf
[idx
+ 1]);
3423 charbuf
[idx
++] = ISO_CODE_ESC
;
3428 cmp_status
->state
= COMPOSING_NO
;
3432 /* If characters are under composition, finish the composition. */
3433 #define MAYBE_FINISH_COMPOSITION() \
3435 if (cmp_status->state != COMPOSING_NO) \
3436 char_offset += finish_composition (charbuf, cmp_status); \
3439 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3441 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3442 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3443 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3444 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3446 Produce this annotation sequence now:
3448 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3451 #define DECODE_COMPOSITION_START(c1) \
3454 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3455 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3456 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3457 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3461 cmp_status->state = COMPOSING_CHAR; \
3462 cmp_status->length += 2; \
3466 MAYBE_FINISH_COMPOSITION (); \
3467 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3468 : c1 == '2' ? COMPOSITION_WITH_RULE \
3469 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3470 : COMPOSITION_WITH_RULE_ALTCHARS); \
3472 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3473 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3474 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3475 cmp_status->nchars = cmp_status->ncomps = 0; \
3476 coding->annotated = 1; \
3481 /* Handle composition end sequence ESC 1. */
3483 #define DECODE_COMPOSITION_END() \
3485 if (cmp_status->nchars == 0 \
3486 || ((cmp_status->state == COMPOSING_CHAR) \
3487 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3489 MAYBE_FINISH_COMPOSITION (); \
3490 goto invalid_code; \
3492 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3493 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3494 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3495 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3496 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3497 char_offset += cmp_status->nchars; \
3498 cmp_status->state = COMPOSING_NO; \
3501 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3503 #define STORE_COMPOSITION_RULE(rule) \
3506 *charbuf++ = rule; \
3507 cmp_status->length += 2; \
3508 cmp_status->state--; \
3511 /* Store a composed char or a component char C in charbuf, and update
3514 #define STORE_COMPOSITION_CHAR(c) \
3517 cmp_status->length++; \
3518 if (cmp_status->state == COMPOSING_CHAR) \
3519 cmp_status->nchars++; \
3521 cmp_status->ncomps++; \
3522 if (cmp_status->method == COMPOSITION_WITH_RULE \
3523 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3524 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3525 cmp_status->state++; \
3529 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3532 decode_coding_iso_2022 (struct coding_system
*coding
)
3534 const unsigned char *src
= coding
->source
+ coding
->consumed
;
3535 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3536 const unsigned char *src_base
;
3537 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
3538 /* We may produce two annotations (charset and composition) in one
3539 loop and one more charset annotation at the end. */
3541 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 3);
3542 ptrdiff_t consumed_chars
= 0, consumed_chars_base
;
3543 bool multibytep
= coding
->src_multibyte
;
3544 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3545 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3546 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3547 int charset_id_2
, charset_id_3
;
3548 struct charset
*charset
;
3550 struct composition_status
*cmp_status
= CODING_ISO_CMP_STATUS (coding
);
3551 Lisp_Object attrs
= CODING_ID_ATTRS (coding
->id
);
3552 ptrdiff_t char_offset
= coding
->produced_char
;
3553 ptrdiff_t last_offset
= char_offset
;
3554 int last_id
= charset_ascii
;
3556 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
3557 int byte_after_cr
= -1;
3560 setup_iso_safe_charsets (attrs
);
3561 coding
->safe_charsets
= SDATA (CODING_ATTR_SAFE_CHARSETS (attrs
));
3563 if (cmp_status
->state
!= COMPOSING_NO
)
3565 if (charbuf_end
- charbuf
< cmp_status
->length
)
3567 for (i
= 0; i
< cmp_status
->length
; i
++)
3568 *charbuf
++ = cmp_status
->carryover
[i
];
3569 coding
->annotated
= 1;
3577 consumed_chars_base
= consumed_chars
;
3579 if (charbuf
>= charbuf_end
)
3581 if (byte_after_cr
>= 0)
3586 if (byte_after_cr
>= 0)
3587 c1
= byte_after_cr
, byte_after_cr
= -1;
3593 if (CODING_ISO_EXTSEGMENT_LEN (coding
) > 0)
3595 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3597 CODING_ISO_EXTSEGMENT_LEN (coding
)--;
3601 if (CODING_ISO_EMBEDDED_UTF_8 (coding
))
3603 if (c1
== ISO_CODE_ESC
)
3605 if (src
+ 1 >= src_end
)
3606 goto no_more_source
;
3607 *charbuf
++ = ISO_CODE_ESC
;
3609 if (src
[0] == '%' && src
[1] == '@')
3612 consumed_chars
+= 2;
3614 /* We are sure charbuf can contain two more chars. */
3617 CODING_ISO_EMBEDDED_UTF_8 (coding
) = 0;
3622 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3628 if ((cmp_status
->state
== COMPOSING_RULE
3629 || cmp_status
->state
== COMPOSING_COMPONENT_RULE
)
3630 && c1
!= ISO_CODE_ESC
)
3634 DECODE_COMPOSITION_RULE (rule
);
3635 STORE_COMPOSITION_RULE (rule
);
3639 /* We produce at most one character. */
3640 switch (iso_code_class
[c1
])
3642 case ISO_0x20_or_0x7F
:
3643 if (charset_id_0
< 0
3644 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
3645 /* This is SPACE or DEL. */
3646 charset
= CHARSET_FROM_ID (charset_ascii
);
3648 charset
= CHARSET_FROM_ID (charset_id_0
);
3651 case ISO_graphic_plane_0
:
3652 if (charset_id_0
< 0)
3653 charset
= CHARSET_FROM_ID (charset_ascii
);
3655 charset
= CHARSET_FROM_ID (charset_id_0
);
3658 case ISO_0xA0_or_0xFF
:
3659 if (charset_id_1
< 0
3660 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
3661 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
3663 /* This is a graphic character, we fall down ... */
3665 case ISO_graphic_plane_1
:
3666 if (charset_id_1
< 0)
3668 charset
= CHARSET_FROM_ID (charset_id_1
);
3672 if (eol_dos
&& c1
== '\r')
3673 ONE_MORE_BYTE (byte_after_cr
);
3674 MAYBE_FINISH_COMPOSITION ();
3675 charset
= CHARSET_FROM_ID (charset_ascii
);
3682 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3683 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
3685 CODING_ISO_INVOCATION (coding
, 0) = 1;
3686 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3690 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
3692 CODING_ISO_INVOCATION (coding
, 0) = 0;
3693 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3696 case ISO_single_shift_2_7
:
3697 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
))
3699 case ISO_single_shift_2
:
3700 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
3702 /* SS2 is handled as an escape sequence of ESC 'N' */
3704 goto label_escape_sequence
;
3706 case ISO_single_shift_3
:
3707 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
3709 /* SS3 is handled as an escape sequence of ESC 'O' */
3711 goto label_escape_sequence
;
3713 case ISO_control_sequence_introducer
:
3714 /* CSI is handled as an escape sequence of ESC '[' ... */
3716 goto label_escape_sequence
;
3720 label_escape_sequence
:
3721 /* Escape sequences handled here are invocation,
3722 designation, direction specification, and character
3723 composition specification. */
3726 case '&': /* revision of following character set */
3728 if (!(c1
>= '@' && c1
<= '~'))
3731 if (c1
!= ISO_CODE_ESC
)
3734 goto label_escape_sequence
;
3736 case '$': /* designation of 2-byte character set */
3737 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3743 if (c1
>= '@' && c1
<= 'B')
3744 { /* designation of JISX0208.1978, GB2312.1980,
3746 reg
= 0, chars96
= 0;
3748 else if (c1
>= 0x28 && c1
<= 0x2B)
3749 { /* designation of DIMENSION2_CHARS94 character set */
3750 reg
= c1
- 0x28, chars96
= 0;
3753 else if (c1
>= 0x2C && c1
<= 0x2F)
3754 { /* designation of DIMENSION2_CHARS96 character set */
3755 reg
= c1
- 0x2C, chars96
= 1;
3760 DECODE_DESIGNATION (reg
, 2, chars96
, c1
);
3761 /* We must update these variables now. */
3763 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3765 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3771 case 'n': /* invocation of locking-shift-2 */
3772 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3773 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3775 CODING_ISO_INVOCATION (coding
, 0) = 2;
3776 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3779 case 'o': /* invocation of locking-shift-3 */
3780 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3781 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3783 CODING_ISO_INVOCATION (coding
, 0) = 3;
3784 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3787 case 'N': /* invocation of single-shift-2 */
3788 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3789 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3791 charset_id_2
= CODING_ISO_DESIGNATION (coding
, 2);
3792 if (charset_id_2
< 0)
3793 charset
= CHARSET_FROM_ID (charset_ascii
);
3795 charset
= CHARSET_FROM_ID (charset_id_2
);
3797 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0)
3798 || (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
3799 && ((CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LEVEL_4
)
3800 ? c1
>= 0x80 : c1
< 0x80)))
3804 case 'O': /* invocation of single-shift-3 */
3805 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3806 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3808 charset_id_3
= CODING_ISO_DESIGNATION (coding
, 3);
3809 if (charset_id_3
< 0)
3810 charset
= CHARSET_FROM_ID (charset_ascii
);
3812 charset
= CHARSET_FROM_ID (charset_id_3
);
3814 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0)
3815 || (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
3816 && ((CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LEVEL_4
)
3817 ? c1
>= 0x80 : c1
< 0x80)))
3821 case '0': case '2': case '3': case '4': /* start composition */
3822 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
3824 if (last_id
!= charset_ascii
)
3826 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
3827 last_id
= charset_ascii
;
3828 last_offset
= char_offset
;
3830 DECODE_COMPOSITION_START (c1
);
3833 case '1': /* end composition */
3834 if (cmp_status
->state
== COMPOSING_NO
)
3836 DECODE_COMPOSITION_END ();
3839 case '[': /* specification of direction */
3840 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
))
3842 /* For the moment, nested direction is not supported.
3843 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3844 left-to-right, and nonzero means right-to-left. */
3848 case ']': /* end of the current direction */
3849 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3851 case '0': /* end of the current direction */
3852 case '1': /* start of left-to-right direction */
3855 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3860 case '2': /* start of right-to-left direction */
3863 coding
->mode
|= CODING_MODE_DIRECTION
;
3877 /* CTEXT extended segment:
3878 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3879 We keep these bytes as is for the moment.
3880 They may be decoded by post-read-conversion. */
3884 ONE_MORE_BYTE (dim
);
3885 if (dim
< '0' || dim
> '4')
3893 size
= ((M
- 128) * 128) + (L
- 128);
3894 if (charbuf
+ 6 > charbuf_end
)
3896 *charbuf
++ = ISO_CODE_ESC
;
3900 *charbuf
++ = BYTE8_TO_CHAR (M
);
3901 *charbuf
++ = BYTE8_TO_CHAR (L
);
3902 CODING_ISO_EXTSEGMENT_LEN (coding
) = size
;
3906 /* XFree86 extension for embedding UTF-8 in CTEXT:
3907 ESC % G --UTF-8-BYTES-- ESC % @
3908 We keep these bytes as is for the moment.
3909 They may be decoded by post-read-conversion. */
3910 if (charbuf
+ 3 > charbuf_end
)
3912 *charbuf
++ = ISO_CODE_ESC
;
3915 CODING_ISO_EMBEDDED_UTF_8 (coding
) = 1;
3923 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3928 if (c1
>= 0x28 && c1
<= 0x2B)
3929 { /* designation of DIMENSION1_CHARS94 character set */
3930 reg
= c1
- 0x28, chars96
= 0;
3933 else if (c1
>= 0x2C && c1
<= 0x2F)
3934 { /* designation of DIMENSION1_CHARS96 character set */
3935 reg
= c1
- 0x2C, chars96
= 1;
3940 DECODE_DESIGNATION (reg
, 1, chars96
, c1
);
3941 /* We must update these variables now. */
3943 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3945 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3957 if (cmp_status
->state
== COMPOSING_NO
3958 && charset
->id
!= charset_ascii
3959 && last_id
!= charset
->id
)
3961 if (last_id
!= charset_ascii
)
3962 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
3963 last_id
= charset
->id
;
3964 last_offset
= char_offset
;
3967 /* Now we know CHARSET and 1st position code C1 of a character.
3968 Produce a decoded character while getting 2nd and 3rd
3969 position codes C2, C3 if necessary. */
3970 if (CHARSET_DIMENSION (charset
) > 1)
3973 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0)
3974 || ((c1
& 0x80) != (c2
& 0x80)))
3975 /* C2 is not in a valid range. */
3977 if (CHARSET_DIMENSION (charset
) == 2)
3978 c1
= (c1
<< 8) | c2
;
3982 if (c3
< 0x20 || (c3
>= 0x80 && c3
< 0xA0)
3983 || ((c1
& 0x80) != (c3
& 0x80)))
3984 /* C3 is not in a valid range. */
3986 c1
= (c1
<< 16) | (c2
<< 8) | c2
;
3990 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3993 MAYBE_FINISH_COMPOSITION ();
3994 for (; src_base
< src
; src_base
++, char_offset
++)
3996 if (ASCII_BYTE_P (*src_base
))
3997 *charbuf
++ = *src_base
;
3999 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
4002 else if (cmp_status
->state
== COMPOSING_NO
)
4007 else if ((cmp_status
->state
== COMPOSING_CHAR
4008 ? cmp_status
->nchars
4009 : cmp_status
->ncomps
)
4010 >= MAX_COMPOSITION_COMPONENTS
)
4012 /* Too long composition. */
4013 MAYBE_FINISH_COMPOSITION ();
4018 STORE_COMPOSITION_CHAR (c
);
4022 MAYBE_FINISH_COMPOSITION ();
4024 consumed_chars
= consumed_chars_base
;
4026 *charbuf
++ = c
< 0 ? -c
: ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4029 /* Reset the invocation and designation status to the safest
4030 one; i.e. designate ASCII to the graphic register 0, and
4031 invoke that register to the graphic plane 0. This typically
4032 helps the case that a designation sequence for ASCII "ESC (
4033 B" is somehow broken (e.g. broken by a newline). */
4034 CODING_ISO_INVOCATION (coding
, 0) = 0;
4035 CODING_ISO_DESIGNATION (coding
, 0) = charset_ascii
;
4036 charset_id_0
= charset_ascii
;
4044 if (cmp_status
->state
!= COMPOSING_NO
)
4046 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
4047 MAYBE_FINISH_COMPOSITION ();
4050 charbuf
-= cmp_status
->length
;
4051 for (i
= 0; i
< cmp_status
->length
; i
++)
4052 cmp_status
->carryover
[i
] = charbuf
[i
];
4055 else if (last_id
!= charset_ascii
)
4056 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4057 coding
->consumed_char
+= consumed_chars_base
;
4058 coding
->consumed
= src_base
- coding
->source
;
4059 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4063 /* ISO2022 encoding stuff. */
4066 It is not enough to say just "ISO2022" on encoding, we have to
4067 specify more details. In Emacs, each coding system of ISO2022
4068 variant has the following specifications:
4069 1. Initial designation to G0 thru G3.
4070 2. Allows short-form designation?
4071 3. ASCII should be designated to G0 before control characters?
4072 4. ASCII should be designated to G0 at end of line?
4073 5. 7-bit environment or 8-bit environment?
4074 6. Use locking-shift?
4075 7. Use Single-shift?
4076 And the following two are only for Japanese:
4077 8. Use ASCII in place of JIS0201-1976-Roman?
4078 9. Use JISX0208-1983 in place of JISX0208-1978?
4079 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4080 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
4084 /* Produce codes (escape sequence) for designating CHARSET to graphic
4085 register REG at DST, and increment DST. If <final-char> of CHARSET is
4086 '@', 'A', or 'B' and the coding system CODING allows, produce
4087 designation sequence of short-form. */
/* Implementation notes for ENCODE_DESIGNATION (CHARSET, REG, CODING):
   - When CODING_ISO_FLAG_REVISION is set and CHARSET has a
     non-negative ISO revision, ESC '&' followed by '@' + revision is
     emitted first.
   - ESC is then emitted.  For a one-dimensional CHARSET the
     intermediate byte is taken from "()*+" (94-character sets) or
     ",-./" (96-character sets), indexed by REG.
   - Otherwise '$' is emitted; for a 94-character set the intermediate
     byte is emitted when CODING_ISO_FLAG_LONG_FORM is set or the
     final byte is outside '@'..'B', and for a 96-character set the
     intermediate byte from ",-./" is emitted.
   - Finally the final byte is emitted and CHARSET's id is recorded as
     the designation of register REG in CODING.
   NOTE(review): some lines of this macro (block delimiters, `else'
   lines, part of the short-form condition) are not visible in this
   copy of the file; confirm against the full definition.  */
4089 #define ENCODE_DESIGNATION(charset, reg, coding) \
4091 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4092 const char *intermediate_char_94 = "()*+"; \
4093 const char *intermediate_char_96 = ",-./"; \
4094 int revision = -1; \
4096 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
4097 revision = CHARSET_ISO_REVISION (charset); \
4099 if (revision >= 0) \
4101 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
4102 EMIT_ONE_BYTE ('@' + revision); \
4104 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4105 if (CHARSET_DIMENSION (charset) == 1) \
4108 if (! CHARSET_ISO_CHARS_96 (charset)) \
4109 b = intermediate_char_94[reg]; \
4111 b = intermediate_char_96[reg]; \
4112 EMIT_ONE_ASCII_BYTE (b); \
4116 EMIT_ONE_ASCII_BYTE ('$'); \
4117 if (! CHARSET_ISO_CHARS_96 (charset)) \
4119 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
4121 || final_char < '@' || final_char > 'B') \
4122 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4125 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4127 EMIT_ONE_ASCII_BYTE (final_char); \
4129 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4133 /* The following two macros produce codes (control character or escape
4134 sequence) for ISO2022 single-shift functions (single-shift-2 and
/* ENCODE_SINGLE_SHIFT_2: emit the single-shift-2 function and mark
   CODING as being in the single-shifting state.  When
   CODING_ISO_FLAG_SEVEN_BITS is set the two-byte form ESC 'N' is
   used; the one-byte control ISO_CODE_SS2 is the other form
   (NOTE(review): the `else' selecting it is not visible in this copy
   of the file).  Relies on `coding' being in scope at the point of
   expansion.  */
4137 #define ENCODE_SINGLE_SHIFT_2 \
4139 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4140 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4142 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4143 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
/* ENCODE_SINGLE_SHIFT_3: emit the single-shift-3 function and mark
   CODING as being in the single-shifting state.  When
   CODING_ISO_FLAG_SEVEN_BITS is set the two-byte form ESC 'O' is
   used; the one-byte control ISO_CODE_SS3 is the other form
   (NOTE(review): the `else' selecting it is not visible in this copy
   of the file).  Relies on `coding' being in scope at the point of
   expansion.  */
4147 #define ENCODE_SINGLE_SHIFT_3 \
4149 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4150 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4152 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4153 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4157 /* The following four macros produce codes (control character or
4158 escape sequence) for ISO2022 locking-shift functions (shift-in,
4159 shift-out, locking-shift-2, and locking-shift-3). */
/* ENCODE_SHIFT_IN: emit the control byte ISO_CODE_SI and record that
   graphic register 0 is now invoked to graphic plane 0 of `coding'
   (which must be in scope at the point of expansion).  */
4161 #define ENCODE_SHIFT_IN \
4163 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4164 CODING_ISO_INVOCATION (coding, 0) = 0; \
/* ENCODE_SHIFT_OUT: emit the control byte ISO_CODE_SO and record that
   graphic register 1 is now invoked to graphic plane 0 of `coding'
   (which must be in scope at the point of expansion).  */
4168 #define ENCODE_SHIFT_OUT \
4170 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4171 CODING_ISO_INVOCATION (coding, 0) = 1; \
/* ENCODE_LOCKING_SHIFT_2: emit the escape sequence ESC 'n' and record
   that graphic register 2 is now invoked to graphic plane 0 of
   `coding' (which must be in scope at the point of expansion).  */
4175 #define ENCODE_LOCKING_SHIFT_2 \
4177 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4178 CODING_ISO_INVOCATION (coding, 0) = 2; \
/* ENCODE_LOCKING_SHIFT_3: emit the escape sequence for the ISO2022
   locking-shift-3 function, ESC 'o', and record that graphic
   register 3 is now invoked to graphic plane 0 of `coding' (which
   must be in scope at the point of expansion).

   The final byte must be 'o': the ISO2022 decoder in this file
   treats ESC 'n' as locking-shift-2 and ESC 'o' as locking-shift-3,
   so emitting 'n' here would make the decoder invoke register 2
   while the encoder state says register 3.  */
#define ENCODE_LOCKING_SHIFT_3					\
  do {								\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');			\
    CODING_ISO_INVOCATION (coding, 0) = 3;			\
  } while (0)
4189 /* Produce codes for a DIMENSION1 character whose character set is
4190 CHARSET and whose position-code is C1. Designation and invocation
4191 sequences are also produced in advance if necessary. */
/* Implementation notes for ENCODE_ISO_CHARACTER_DIMENSION1:
   - If CODING_ISO_FLAG_USE_ROMAN is set and CHARSET is ASCII, CHARSET
     is replaced by the charset whose id is charset_jisx0201_roman.
   - While a single shift is pending, C1 is emitted with the high bit
     cleared (seven-bit flag set) or set, and the single-shifting
     state is cleared.
   - If CHARSET is the charset invoked to graphic plane 0, C1 & 0x7F
     is emitted; if it is the one invoked to plane 1, C1 | 0x80.
   - Otherwise encode_invocation_designation is called to emit the
     needed sequences before retrying.
   The macro uses `coding' and `dst' from the point of expansion and
   may assign to its CHARSET argument.  C1 is expanded without
   parentheses.
   NOTE(review): block delimiters, `else' lines and the tail of the
   last call are not visible in this copy of the file.  */
4193 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4195 int id = CHARSET_ID (charset); \
4197 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4198 && id == charset_ascii) \
4200 id = charset_jisx0201_roman; \
4201 charset = CHARSET_FROM_ID (id); \
4204 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4206 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4207 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4209 EMIT_ONE_BYTE (c1 | 0x80); \
4210 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4213 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4215 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4218 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4220 EMIT_ONE_BYTE (c1 | 0x80); \
4224 /* Since CHARSET is not yet invoked to any graphic planes, we \
4225 must invoke it, or, at first, designate it to some graphic \
4226 register. Then repeat the loop to actually produce the \
4228 dst = encode_invocation_designation (charset, coding, dst, \
4233 /* Produce codes for a DIMENSION2 character whose character set is
4234 CHARSET and whose position-codes are C1 and C2. Designation and
4235 invocation codes are also produced in advance if necessary. */
/* Implementation notes for ENCODE_ISO_CHARACTER_DIMENSION2:
   - If CODING_ISO_FLAG_USE_OLDJIS is set and CHARSET's id is
     charset_jisx0208, CHARSET is replaced by the charset whose id is
     charset_jisx0208_1978.
   - While a single shift is pending, C1 and C2 are emitted with the
     high bits cleared (seven-bit flag set) or set, and the
     single-shifting state is cleared.
   - If CHARSET is the charset invoked to graphic plane 0, both bytes
     are emitted masked with 0x7F; if it is the one invoked to plane
     1, both are emitted with 0x80 set.
   - Otherwise encode_invocation_designation is called to emit the
     needed sequences before retrying.
   The macro uses `coding' and `dst' from the point of expansion and
   may assign to its CHARSET argument.
   NOTE(review): block delimiters, `else' lines and the tail of the
   last call are not visible in this copy of the file.  */
4237 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4239 int id = CHARSET_ID (charset); \
4241 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4242 && id == charset_jisx0208) \
4244 id = charset_jisx0208_1978; \
4245 charset = CHARSET_FROM_ID (id); \
4248 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4250 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4251 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4253 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4254 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4257 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4259 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4262 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4264 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4268 /* Since CHARSET is not yet invoked to any graphic planes, we \
4269 must invoke it, or, at first, designate it to some graphic \
4270 register. Then repeat the loop to actually produce the \
4272 dst = encode_invocation_designation (charset, coding, dst, \
/* ENCODE_ISO_CHARACTER (CHARSET, C): obtain the code of character C
   in CHARSET through CODING_ENCODE_CHAR (result in `code'), then
   emit it with ENCODE_ISO_CHARACTER_DIMENSION1 when CHARSET is
   one-dimensional, or with ENCODE_ISO_CHARACTER_DIMENSION2 using
   the high byte (code >> 8) and low byte (code & 0xFF) otherwise.
   Uses `coding', `dst' and `dst_end' from the point of expansion.
   NOTE(review): the declaration of `code' and any handling of an
   invalid code are not visible in this copy of the file.  */
4277 #define ENCODE_ISO_CHARACTER(charset, c) \
4280 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4282 if (CHARSET_DIMENSION (charset) == 1) \
4283 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4285 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4289 /* Produce designation and invocation codes at a place pointed by DST
4290 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
/* Outline of encode_invocation_designation:
   1. Look through graphic registers 0..3 of CODING for one that
      already has CHARSET designated.
   2. If none has, take the register requested for CHARSET via
      CODING_ISO_REQUEST (falling back to register 0, per the comment
      below) and emit a designation with ENCODE_DESIGNATION.
   3. If that register is invoked to neither graphic plane 0 nor
      plane 1, emit an invocation chosen by register number; for
      registers 2 and 3 a single shift is used when
      CODING_ISO_FLAG_SINGLE_SHIFT is set, and a locking shift is
      the other choice.
   The produced-character count is read from *P_NCHARS on entry and
   written back on exit.
   NOTE(review): braces, `break'/`else' lines, the bodies of cases 0
   and 1, and the return statement are not visible in this copy of
   the file.  */
4293 static unsigned char *
4294 encode_invocation_designation (struct charset
*charset
,
4295 struct coding_system
*coding
,
4296 unsigned char *dst
, ptrdiff_t *p_nchars
)
4298 bool multibytep
= coding
->dst_multibyte
;
4299 ptrdiff_t produced_chars
= *p_nchars
;
4300 int reg
; /* graphic register number */
4301 int id
= CHARSET_ID (charset
);
4303 /* At first, check designations. */
4304 for (reg
= 0; reg
< 4; reg
++)
4305 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
4310 /* CHARSET is not yet designated to any graphic registers. */
4311 /* At first check the requested designation. */
4312 reg
= CODING_ISO_REQUEST (coding
, id
);
4314 /* Since CHARSET requests no special designation, designate it
4315 to graphic register 0. */
4318 ENCODE_DESIGNATION (charset
, reg
, coding
);
4321 if (CODING_ISO_INVOCATION (coding
, 0) != reg
4322 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
4324 /* Since the graphic register REG is not invoked to any graphic
4325 planes, invoke it to graphic plane 0. */
4328 case 0: /* graphic register 0 */
4332 case 1: /* graphic register 1 */
4336 case 2: /* graphic register 2 */
4337 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
4338 ENCODE_SINGLE_SHIFT_2
;
4340 ENCODE_LOCKING_SHIFT_2
;
4343 case 3: /* graphic register 3 */
4344 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
4345 ENCODE_SINGLE_SHIFT_3
;
4347 ENCODE_LOCKING_SHIFT_3
;
/* Hand the updated produced-character count back to the caller.  */
4352 *p_nchars
= produced_chars
;
4357 /* Produce codes for designation and invocation to reset the graphic
4358 planes and registers to initial state. */
/* Implementation notes for ENCODE_RESET_PLANE_AND_REGISTER:
   - The invocation of graphic plane 0 is tested against register 0
     (NOTE(review): the statement executed when it differs is not
     visible in this copy of the file; presumably a shift-in).
   - For each graphic register 0..3 that has an initial charset
     (CODING_ISO_INITIAL >= 0) different from its current
     designation, that initial charset is designated again with
     ENCODE_DESIGNATION.
   Uses `coding' from the point of expansion.  */
4359 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4362 struct charset *charset; \
4364 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4366 for (reg = 0; reg < 4; reg++) \
4367 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4368 && (CODING_ISO_DESIGNATION (coding, reg) \
4369 != CODING_ISO_INITIAL (coding, reg))) \
4371 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4372 ENCODE_DESIGNATION (charset, reg, coding); \
4377 /* Produce designation sequences of charsets in the line started from
4378 CHARBUF to a place pointed by DST, and return the number of
4379 produced bytes. DST should not directly point to a buffer text area
4380 which may be relocated by char_charset call.
4382 If the current block ends before any end-of-line, we may fail to
4383 find all the necessary designations. */
/* Outline of encode_designation_at_bol:
   1. Fetch the charset list of CODING's attributes, substituting
      Viso_2022_charset_list when the list is the symbol Qiso_2022.
   2. Scan characters from CHARBUF while fewer than four registers
      have been found: for each, look up its charset with
      char_charset and the register requested for it with
      CODING_ISO_REQUEST, and consider it when that register is
      valid and still unassigned in the table R.
   3. For each register 0..3 whose table entry differs from CODING's
      current designation, emit a designation with
      ENCODE_DESIGNATION.
   ORIG remembers the initial DST.
   NOTE(review): the return type, the DST parameter, the declaration
   and initialization of R, the loop bodies and the return statement
   are not visible in this copy of the file.  */
4386 encode_designation_at_bol (struct coding_system
*coding
,
4387 int *charbuf
, int *charbuf_end
,
4390 unsigned char *orig
= dst
;
4391 struct charset
*charset
;
4392 /* Table of charsets to be designated to each graphic register. */
4394 int c
, found
= 0, reg
;
4395 ptrdiff_t produced_chars
= 0;
4396 bool multibytep
= coding
->dst_multibyte
;
4398 Lisp_Object charset_list
;
4400 attrs
= CODING_ID_ATTRS (coding
->id
);
4401 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
4402 if (EQ (charset_list
, Qiso_2022
))
4403 charset_list
= Viso_2022_charset_list
;
4405 for (reg
= 0; reg
< 4; reg
++)
4408 while (charbuf
< charbuf_end
&& found
< 4)
4415 charset
= char_charset (c
, charset_list
, NULL
);
4416 id
= CHARSET_ID (charset
);
4417 reg
= CODING_ISO_REQUEST (coding
, id
);
4418 if (reg
>= 0 && r
[reg
] < 0)
4427 for (reg
= 0; reg
< 4; reg
++)
4429 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
4430 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
4436 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Outline of encode_coding_iso_2022:
   - Source is CODING->charbuf (charbuf_used elements); output goes to
     CODING->destination starting at offset CODING->produced and
     bounded by CODING->dst_bytes.
   - ASCII may be emitted directly only when the coding system is
     marked ASCII compatible and uses neither designation nor locking
     shift.
   - When a beginning-of-line designation is pending, the sequences
     built by encode_designation_at_bol in a 16-byte scratch buffer
     are copied to the destination first.
   - A charset annotation sets a preferred charset id, which is
     dropped again (-1) if an Fmemq lookup does not find it;
     composition annotations are not implemented.
   - Control characters (below 0x20, or 0x7F) may reset planes and
     registers according to the RESET_AT_EOL, INIT_AT_BOL and
     RESET_AT_CNTL flags before being emitted as is.
   - Other characters are emitted with ENCODE_ISO_CHARACTER once a
     charset has been chosen.  If none is found, either
     CODING_INHIBIT_CHARACTER_SUBSTITUTION in ASCII (safe-encoding
     mode) or CODING->default_char is used instead.
   - On the last block, planes and registers are reset if
     RESET_AT_EOL is set.  The function then records success, saves
     the beginning-of-line state, and updates CODING->produced_char
     and CODING->produced.
   NOTE(review): many short lines (braces, `else', the fetch of C
   from CHARBUF, the EOL test, the raw-byte emission) are not visible
   in this copy of the file.  */
4439 encode_coding_iso_2022 (struct coding_system
*coding
)
4441 bool multibytep
= coding
->dst_multibyte
;
4442 int *charbuf
= coding
->charbuf
;
4443 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4444 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4445 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4447 bool bol_designation
4448 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4449 && CODING_ISO_BOL (coding
));
4450 ptrdiff_t produced_chars
= 0;
4451 Lisp_Object attrs
, eol_type
, charset_list
;
4452 bool ascii_compatible
;
4454 int preferred_charset_id
= -1;
4456 CODING_GET_INFO (coding
, attrs
, charset_list
);
4457 eol_type
= inhibit_eol_conversion
? Qunix
: CODING_ID_EOL_TYPE (coding
->id
);
4458 if (VECTORP (eol_type
))
4461 setup_iso_safe_charsets (attrs
);
4462 /* Charset list may have been changed. */
4463 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
4464 coding
->safe_charsets
= SDATA (CODING_ATTR_SAFE_CHARSETS (attrs
));
4467 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
))
4468 && ! (CODING_ISO_FLAGS (coding
) & (CODING_ISO_FLAG_DESIGNATION
4469 | CODING_ISO_FLAG_LOCKING_SHIFT
)));
4471 while (charbuf
< charbuf_end
)
4473 ASSURE_DESTINATION (safe_room
);
4475 if (bol_designation
)
4477 /* We have to produce designation sequences if any now. */
4478 unsigned char desig_buf
[16];
4482 charset_map_loaded
= 0;
4483 nbytes
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
,
4485 if (charset_map_loaded
4486 && (offset
= coding_change_destination (coding
)))
4491 memcpy (dst
, desig_buf
, nbytes
);
4493 /* We are sure that designation sequences are all ASCII bytes. */
4494 produced_chars
+= nbytes
;
4495 bol_designation
= 0;
4496 ASSURE_DESTINATION (safe_room
);
4503 /* Handle an annotation. */
4506 case CODING_ANNOTATE_COMPOSITION_MASK
:
4507 /* Not yet implemented. */
4509 case CODING_ANNOTATE_CHARSET_MASK
:
4510 preferred_charset_id
= charbuf
[2];
4511 if (preferred_charset_id
>= 0
4512 && NILP (Fmemq (make_number (preferred_charset_id
),
4514 preferred_charset_id
= -1;
4523 /* Now encode the character C. */
4524 if (c
< 0x20 || c
== 0x7F)
4527 || (c
== '\r' && EQ (eol_type
, Qmac
)))
4529 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
4530 ENCODE_RESET_PLANE_AND_REGISTER ();
4531 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
4535 for (i
= 0; i
< 4; i
++)
4536 CODING_ISO_DESIGNATION (coding
, i
)
4537 = CODING_ISO_INITIAL (coding
, i
);
4539 bol_designation
= ((CODING_ISO_FLAGS (coding
)
4540 & CODING_ISO_FLAG_DESIGNATE_AT_BOL
)
4543 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
4544 ENCODE_RESET_PLANE_AND_REGISTER ();
4545 EMIT_ONE_ASCII_BYTE (c
);
4547 else if (ASCII_CHAR_P (c
))
4549 if (ascii_compatible
)
4550 EMIT_ONE_ASCII_BYTE (c
);
4553 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
4554 ENCODE_ISO_CHARACTER (charset
, c
);
4557 else if (CHAR_BYTE8_P (c
))
4559 c
= CHAR_TO_BYTE8 (c
);
4564 struct charset
*charset
;
4566 if (preferred_charset_id
>= 0)
4570 charset
= CHARSET_FROM_ID (preferred_charset_id
);
4571 CODING_CHAR_CHARSET_P (coding
, dst
, dst_end
, c
, charset
, result
);
4573 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
4577 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
4581 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4583 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4584 charset
= CHARSET_FROM_ID (charset_ascii
);
4588 c
= coding
->default_char
;
4589 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
,
4590 charset_list
, NULL
, charset
);
4593 ENCODE_ISO_CHARACTER (charset
, c
);
4597 if (coding
->mode
& CODING_MODE_LAST_BLOCK
4598 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
4600 ASSURE_DESTINATION (safe_room
);
4601 ENCODE_RESET_PLANE_AND_REGISTER ();
4603 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4604 CODING_ISO_BOL (coding
) = bol_designation
;
4605 coding
->produced_char
+= produced_chars
;
4606 coding
->produced
= dst
- coding
->destination
;
4611 /*** 8,9. SJIS and BIG5 handlers ***/
4613 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4614 quite widely. So, for the moment, Emacs supports them in the bare
4615 C code. But, in the future, they may be supported only by CCL. */
4617 /* SJIS is a coding system encoding three character sets: ASCII, right
4618 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4619 as is. A character of charset katakana-jisx0201 is encoded by
4620 "position-code + 0x80". A character of charset japanese-jisx0208
4621 is encoded in 2 bytes, but the two position-codes are divided and shifted
4622 so that they fit in the ranges below.
4624 --- CODE RANGE of SJIS ---
4625 (character set) (range)
4627 KATAKANA-JISX0201 0xA0 .. 0xDF
4628 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4629 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4630 -------------------------------
4634 /* BIG5 is a coding system encoding two character sets: ASCII and
4635 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4636 character set and is encoded in two bytes.
4638 --- CODE RANGE of BIG5 ---
4639 (character set) (range)
4641 Big5 (1st byte) 0xA1 .. 0xFE
4642 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4643 --------------------------
4647 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4648 Return true if a text is encoded in SJIS. */
/* Outline of detect_coding_sjis:
   - Marks CATEGORY_MASK_SJIS as checked in DETECT_INFO and skips the
     leading ASCII part of the source (CODING->head_ascii bytes).
   - The largest accepted first byte of a two-byte code is 0xFC when
     the coding system's charset list has more than three elements,
     and 0xEF otherwise.
   - A first byte in 0x81..0x9F or 0xE0..that maximum must be followed
     by a second byte that is not below 0x40, not 0x7F and not above
     0xFC; such a pair, or a single byte in 0xA0..0xDF, counts as
     evidence for SJIS.
   - CATEGORY_MASK_SJIS is added to DETECT_INFO->rejected on the
     failure paths, including when the source ends in the middle of a
     sequence on the last block; otherwise the evidence found is
     merged into DETECT_INFO->found.
   NOTE(review): the loop, the byte-fetching calls, `break' lines and
   the return statements are not visible in this copy of the file.  */
4651 detect_coding_sjis (struct coding_system
*coding
,
4652 struct coding_detection_info
*detect_info
)
4654 const unsigned char *src
= coding
->source
, *src_base
;
4655 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4656 bool multibytep
= coding
->src_multibyte
;
4657 ptrdiff_t consumed_chars
= 0;
4660 Lisp_Object attrs
, charset_list
;
4661 int max_first_byte_of_2_byte_code
;
4663 CODING_GET_INFO (coding
, attrs
, charset_list
);
4664 max_first_byte_of_2_byte_code
4665 = (XINT (Flength (charset_list
)) > 3 ? 0xFC : 0xEF);
4667 detect_info
->checked
|= CATEGORY_MASK_SJIS
;
4668 /* A coding system of this category is always ASCII compatible. */
4669 src
+= coding
->head_ascii
;
4677 if ((c
>= 0x81 && c
<= 0x9F)
4678 || (c
>= 0xE0 && c
<= max_first_byte_of_2_byte_code
))
4681 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
4683 found
= CATEGORY_MASK_SJIS
;
4685 else if (c
>= 0xA0 && c
< 0xE0)
4686 found
= CATEGORY_MASK_SJIS
;
4690 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
4694 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4696 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
4699 detect_info
->found
|= found
;
4703 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4704 Return true if a text is encoded in BIG5. */
/* Outline of detect_coding_big5:
   - Marks CATEGORY_MASK_BIG5 as checked in DETECT_INFO and skips the
     leading ASCII part of the source (CODING->head_ascii bytes).
   - A byte C below 0x40 or in 0x7F..0xA0 fails the range test shown
     below (NOTE(review): the line that reads C and the test on the
     preceding byte are not visible in this copy of the file;
     presumably C is the second byte of a two-byte code).  Passing
     the test counts as evidence for BIG5.
   - CATEGORY_MASK_BIG5 is added to DETECT_INFO->rejected on the
     failure paths, including when the source ends in the middle of a
     sequence on the last block; otherwise the evidence found is
     merged into DETECT_INFO->found.  */
4707 detect_coding_big5 (struct coding_system
*coding
,
4708 struct coding_detection_info
*detect_info
)
4710 const unsigned char *src
= coding
->source
, *src_base
;
4711 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4712 bool multibytep
= coding
->src_multibyte
;
4713 ptrdiff_t consumed_chars
= 0;
4717 detect_info
->checked
|= CATEGORY_MASK_BIG5
;
4718 /* A coding system of this category is always ASCII compatible. */
4719 src
+= coding
->head_ascii
;
4730 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
4732 found
= CATEGORY_MASK_BIG5
;
4737 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
4741 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4743 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
4746 detect_info
->found
|= found
;
4750 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
/* Outline of decode_coding_sjis:
   - Input is CODING->source from offset CODING->consumed up to
     src_bytes; decoded characters are appended to CODING->charbuf
     after charbuf_used, leaving room for two annotations of
     MAX_ANNOTATION_LENGTH at the end of the buffer.
   - The charsets are taken in order from a list held in VAL: roman,
     kana, kanji, and an optional second kanji charset (NULL when the
     list ends after three elements).
   - With DOS end-of-line decoding enabled, the byte following a CR is
     read ahead into byte_after_cr and consumed on the next
     iteration.
   - Bytes 0xA1..0xDF select the kana charset; two-byte codes whose
     second byte is below 0x40, equal to 0x7F or above 0xFC are
     rejected; first bytes up to 0xFC select the second kanji charset
     when it exists.
   - A charset annotation is added with ADD_CHARSET_DATA whenever the
     charset changes away from a non-ASCII one, and once more at the
     end if a non-ASCII run is still open.
   - For an invalid sequence the consumed-character count is rolled
     back and the offending byte is stored as a raw byte (or -C when
     C is negative).
   - Finally CODING->consumed_char, CODING->consumed and
     CODING->charbuf_used are updated.
   NOTE(review): the main loop, several branch heads, the
   initialization of VAL, the code conversions and the labels are not
   visible in this copy of the file.  */
4753 decode_coding_sjis (struct coding_system
*coding
)
4755 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4756 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4757 const unsigned char *src_base
;
4758 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
4759 /* We may produce one charset annotation in one loop and one more at
4762 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 2);
4763 ptrdiff_t consumed_chars
= 0, consumed_chars_base
;
4764 bool multibytep
= coding
->src_multibyte
;
4765 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4766 struct charset
*charset_kanji2
;
4767 Lisp_Object attrs
, charset_list
, val
;
4768 ptrdiff_t char_offset
= coding
->produced_char
;
4769 ptrdiff_t last_offset
= char_offset
;
4770 int last_id
= charset_ascii
;
4772 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
4773 int byte_after_cr
= -1;
4775 CODING_GET_INFO (coding
, attrs
, charset_list
);
4778 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4779 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4780 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4781 charset_kanji2
= NILP (val
) ? NULL
: CHARSET_FROM_ID (XINT (XCAR (val
)));
4786 struct charset
*charset
;
4789 consumed_chars_base
= consumed_chars
;
4791 if (charbuf
>= charbuf_end
)
4793 if (byte_after_cr
>= 0)
4798 if (byte_after_cr
>= 0)
4799 c
= byte_after_cr
, byte_after_cr
= -1;
4806 if (eol_dos
&& c
== '\r')
4807 ONE_MORE_BYTE (byte_after_cr
);
4808 charset
= charset_roman
;
4810 else if (c
== 0x80 || c
== 0xA0)
4812 else if (c
>= 0xA1 && c
<= 0xDF)
4814 /* SJIS -> JISX0201-Kana */
4816 charset
= charset_kana
;
4820 /* SJIS -> JISX0208 */
4822 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
4826 charset
= charset_kanji
;
4828 else if (c
<= 0xFC && charset_kanji2
)
4830 /* SJIS -> JISX0213-2 */
4832 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
4836 charset
= charset_kanji2
;
4840 if (charset
->id
!= charset_ascii
4841 && last_id
!= charset
->id
)
4843 if (last_id
!= charset_ascii
)
4844 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4845 last_id
= charset
->id
;
4846 last_offset
= char_offset
;
4848 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4855 consumed_chars
= consumed_chars_base
;
4857 *charbuf
++ = c
< 0 ? -c
: BYTE8_TO_CHAR (c
);
4863 if (last_id
!= charset_ascii
)
4864 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4865 coding
->consumed_char
+= consumed_chars_base
;
4866 coding
->consumed
= src_base
- coding
->source
;
4867 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
/* Outline of decode_coding_big5:
   - Input is CODING->source from offset CODING->consumed up to
     src_bytes; decoded characters are appended to CODING->charbuf
     after charbuf_used, leaving room for two annotations of
     MAX_ANNOTATION_LENGTH at the end of the buffer.
   - The charsets are taken in order from a list held in VAL: roman
     first, then Big5.
   - With DOS end-of-line decoding enabled, the byte following a CR is
     read ahead into byte_after_cr and consumed on the next
     iteration.
   - A first byte outside 0xA1..0xFE, or a second byte outside
     0x40..0x7E and 0xA1..0xFE, is rejected; otherwise the Big5
     charset is selected.
   - A charset annotation is added with ADD_CHARSET_DATA whenever the
     charset changes away from a non-ASCII one, and once more at the
     end if a non-ASCII run is still open.
   - For an invalid sequence the consumed-character count is rolled
     back and the offending byte is stored as a raw byte (or -C when
     C is negative).
   - Finally CODING->consumed_char, CODING->consumed and
     CODING->charbuf_used are updated.
   NOTE(review): the main loop, several branch heads, the
   initialization of VAL, the code combination and the labels are not
   visible in this copy of the file.  */
4871 decode_coding_big5 (struct coding_system
*coding
)
4873 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4874 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4875 const unsigned char *src_base
;
4876 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
4877 /* We may produce one charset annotation in one loop and one more at
4880 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 2);
4881 ptrdiff_t consumed_chars
= 0, consumed_chars_base
;
4882 bool multibytep
= coding
->src_multibyte
;
4883 struct charset
*charset_roman
, *charset_big5
;
4884 Lisp_Object attrs
, charset_list
, val
;
4885 ptrdiff_t char_offset
= coding
->produced_char
;
4886 ptrdiff_t last_offset
= char_offset
;
4887 int last_id
= charset_ascii
;
4889 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
4890 int byte_after_cr
= -1;
4892 CODING_GET_INFO (coding
, attrs
, charset_list
);
4894 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4895 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4900 struct charset
*charset
;
4903 consumed_chars_base
= consumed_chars
;
4905 if (charbuf
>= charbuf_end
)
4907 if (byte_after_cr
>= 0)
4912 if (byte_after_cr
>= 0)
4913 c
= byte_after_cr
, byte_after_cr
= -1;
4921 if (eol_dos
&& c
== '\r')
4922 ONE_MORE_BYTE (byte_after_cr
);
4923 charset
= charset_roman
;
4928 if (c
< 0xA1 || c
> 0xFE)
4931 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
4934 charset
= charset_big5
;
4936 if (charset
->id
!= charset_ascii
4937 && last_id
!= charset
->id
)
4939 if (last_id
!= charset_ascii
)
4940 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4941 last_id
= charset
->id
;
4942 last_offset
= char_offset
;
4944 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4951 consumed_chars
= consumed_chars_base
;
4953 *charbuf
++ = c
< 0 ? -c
: BYTE8_TO_CHAR (c
);
4959 if (last_id
!= charset_ascii
)
4960 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4961 coding
->consumed_char
+= consumed_chars_base
;
4962 coding
->consumed
= src_base
- coding
->source
;
4963 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4966 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4967 This function can encode charsets `ascii', `katakana-jisx0201',
4968 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4969 are sure that all these charsets are registered as official charset
4970 (i.e. do not have extended leading-codes). Characters of other
4971 charsets are produced without any encoding. */
/* Outline of encode_coding_sjis:
   - Source is CODING->charbuf (charbuf_used elements); output goes to
     CODING->destination starting at offset CODING->produced and
     bounded by CODING->dst_bytes.
   - The first element of the charset list is skipped; the following
     ones give the kana charset, the kanji charset and an optional
     second kanji charset (NULL when the list ends there).
   - An ASCII character is emitted as is when the coding system is
     marked ASCII compatible.
   - Otherwise a charset and code are looked up with
     CODING_CHAR_CHARSET.  If none is found, either
     CODING_INHIBIT_CHARACTER_SUBSTITUTION in ASCII (safe-encoding
     mode) or CODING->default_char is used instead.
   - A kanji code is emitted as two bytes (high byte, then low byte);
     a kana code as one byte with 0x80 set; a second-kanji code is
     emitted as two bytes after JIS_TO_SJIS2 when its first
     position-code passes the range test below; anything else is
     emitted as CODE & 0x7F.
   - Finally success is recorded and CODING->produced_char and
     CODING->produced are updated.
   NOTE(review): the fetch of C from CHARBUF, the raw-byte emission,
   the JIS-to-SJIS conversion of kanji codes, part of the
   second-kanji range test, `else' lines and the return statement are
   not visible in this copy of the file.  */
4974 encode_coding_sjis (struct coding_system
*coding
)
4976 bool multibytep
= coding
->dst_multibyte
;
4977 int *charbuf
= coding
->charbuf
;
4978 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4979 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4980 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4982 ptrdiff_t produced_chars
= 0;
4983 Lisp_Object attrs
, charset_list
, val
;
4984 bool ascii_compatible
;
4985 struct charset
*charset_kanji
, *charset_kana
;
4986 struct charset
*charset_kanji2
;
4989 CODING_GET_INFO (coding
, attrs
, charset_list
);
4990 val
= XCDR (charset_list
);
4991 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4992 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4993 charset_kanji2
= NILP (val
) ? NULL
: CHARSET_FROM_ID (XINT (XCAR (val
)));
4995 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4997 while (charbuf
< charbuf_end
)
4999 ASSURE_DESTINATION (safe_room
);
5001 /* Now encode the character C. */
5002 if (ASCII_CHAR_P (c
) && ascii_compatible
)
5003 EMIT_ONE_ASCII_BYTE (c
);
5004 else if (CHAR_BYTE8_P (c
))
5006 c
= CHAR_TO_BYTE8 (c
);
5012 struct charset
*charset
;
5013 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
5018 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
5020 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
5021 charset
= CHARSET_FROM_ID (charset_ascii
);
5025 c
= coding
->default_char
;
5026 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
,
5027 charset_list
, &code
, charset
);
5030 if (code
== CHARSET_INVALID_CODE (charset
))
5032 if (charset
== charset_kanji
)
5036 c1
= code
>> 8, c2
= code
& 0xFF;
5037 EMIT_TWO_BYTES (c1
, c2
);
5039 else if (charset
== charset_kana
)
5040 EMIT_ONE_BYTE (code
| 0x80);
5041 else if (charset_kanji2
&& charset
== charset_kanji2
)
5046 if (c1
== 0x21 || (c1
>= 0x23 && c1
<= 0x25)
5048 || (c1
>= 0x2C && c1
<= 0x2F) || c1
>= 0x6E)
5050 JIS_TO_SJIS2 (code
);
5051 c1
= code
>> 8, c2
= code
& 0xFF;
5052 EMIT_TWO_BYTES (c1
, c2
);
5055 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
5058 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
5061 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5062 coding
->produced_char
+= produced_chars
;
5063 coding
->produced
= dst
- coding
->destination
;
/* Encode the characters in CODING->charbuf (CODING->charbuf_used of
   them) into CODING->destination, writing from offset
   CODING->produced.  The Big5 charset is taken from the second
   element of the charset list fetched by CODING_GET_INFO.  On exit
   the result is recorded as CODING_RESULT_SUCCESS and
   CODING->produced_char and CODING->produced are brought up to
   date.  */
5068 encode_coding_big5 (struct coding_system
*coding
)
5070 bool multibytep
= coding
->dst_multibyte
;
5071 int *charbuf
= coding
->charbuf
;
5072 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5073 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5074 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5076 ptrdiff_t produced_chars
= 0;
5077 Lisp_Object attrs
, charset_list
, val
;
5078 bool ascii_compatible
;
5079 struct charset
*charset_big5
;
5082 CODING_GET_INFO (coding
, attrs
, charset_list
);
/* Skip the first element of the charset list; the next one names
   the Big5 charset.  */
5083 val
= XCDR (charset_list
);
5084 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
5085 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
5087 while (charbuf
< charbuf_end
)
5089 ASSURE_DESTINATION (safe_room
);
5091 /* Now encode the character C. */
5092 if (ASCII_CHAR_P (c
) && ascii_compatible
)
5093 EMIT_ONE_ASCII_BYTE (c
);
5094 else if (CHAR_BYTE8_P (c
))
5096 c
= CHAR_TO_BYTE8 (c
);
5102 struct charset
*charset
;
5103 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
/* In safe-encoding mode CODE becomes
   CODING_INHIBIT_CHARACTER_SUBSTITUTION and CHARSET the ASCII
   charset.  */
5108 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
5110 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
5111 charset
= CHARSET_FROM_ID (charset_ascii
);
/* Substitute CODING->default_char and look up its charset and code
   again.  */
5115 c
= coding
->default_char
;
5116 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
,
5117 charset_list
, &code
, charset
);
5120 if (code
== CHARSET_INVALID_CODE (charset
))
5122 if (charset
== charset_big5
)
/* C1 receives CODE shifted right by 8 and C2 its low 8 bits; the
   two are emitted in that order.  */
5126 c1
= code
>> 8, c2
= code
& 0xFF;
5127 EMIT_TWO_BYTES (c1
, c2
);
/* Any other charset is emitted as the low 7 bits of CODE.  */
5130 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
/* Record success and publish the character and byte counts.  */
5133 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5134 coding
->produced_char
+= produced_chars
;
5135 coding
->produced
= dst
- coding
->destination
;
5140 /*** 9. CCL handlers ***/
5142 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5143 Return true if a text is encoded in a coding system of which
5144 encoder/decoder are written in CCL program. */
/* CATEGORY_MASK_CCL is first marked as checked in DETECT_INFO.  The
   local CODING is then redirected to the coding_category_ccl entry
   of coding_categories, so the VALIDS table and ATTRS used below
   describe that category rather than the coding system passed in.
   The local verdict FOUND is merged into DETECT_INFO->found at the
   end.  */
5147 detect_coding_ccl (struct coding_system
*coding
,
5148 struct coding_detection_info
*detect_info
)
5150 const unsigned char *src
= coding
->source
, *src_base
;
5151 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5152 bool multibytep
= coding
->src_multibyte
;
5153 ptrdiff_t consumed_chars
= 0;
5155 unsigned char *valids
;
5156 ptrdiff_t head_ascii
= coding
->head_ascii
;
5159 detect_info
->checked
|= CATEGORY_MASK_CCL
;
/* From here on CODING refers to the CCL category entry.  */
5161 coding
= &coding_categories
[coding_category_ccl
];
5162 valids
= CODING_CCL_VALIDS (coding
);
5163 attrs
= CODING_ID_ATTRS (coding
->id
);
5164 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
/* A negative C, or a C whose VALIDS entry is zero, satisfies this
   test.  NOTE(review): presumably this ends the scan -- confirm.  */
5173 if (c
< 0 || ! valids
[c
])
/* A VALIDS entry greater than 1 marks the CCL category as found.  */
5175 if ((valids
[c
] > 1))
5176 found
= CATEGORY_MASK_CCL
;
5178 detect_info
->rejected
|= CATEGORY_MASK_CCL
;
/* Merge the local verdict into DETECT_INFO.  */
5182 detect_info
->found
|= found
;
/* Decode the source of CODING by running its CCL program
   (CODING->spec.ccl->ccl) through ccl_driver.  Input starts at
   CODING->source + CODING->consumed and is staged in chunks of at
   most 1024 elements; output is appended to CODING->charbuf after
   the CODING->charbuf_used elements already there.  The final CCL
   status is translated into a conversion result, and
   CODING->consumed_char, CODING->consumed and CODING->charbuf_used
   are updated on exit.  */
5187 decode_coding_ccl (struct coding_system
*coding
)
5189 const unsigned char *src
= coding
->source
+ coding
->consumed
;
5190 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5191 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
5192 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
;
5193 ptrdiff_t consumed_chars
= 0;
5194 bool multibytep
= coding
->src_multibyte
;
5195 struct ccl_program
*ccl
= &coding
->spec
.ccl
->ccl
;
/* Staging buffers for one ccl_driver call: up to 1024 source
   elements, plus the offset from SRC at which each element starts.
   SOURCE_BYTEIDX has one extra slot because the offset just past
   the last element is stored as well.  */
5196 int source_charbuf
[1024];
5197 int source_byteidx
[1025];
5198 Lisp_Object attrs
, charset_list
;
5200 CODING_GET_INFO (coding
, attrs
, charset_list
);
5204 const unsigned char *p
= src
;
/* Fill SOURCE_CHARBUF through STRING_CHAR_ADVANCE, recording in
   SOURCE_BYTEIDX the offset of P from SRC before each element.  */
5210 while (i
< 1024 && p
< src_end
)
5212 source_byteidx
[i
] = p
- src
;
5213 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
5215 source_byteidx
[i
] = p
- src
;
/* Alternative fill: one source byte per element.  NOTE(review):
   the condition choosing between the two fill loops is not visible
   here; presumably MULTIBYTEP -- confirm.  */
5218 while (i
< 1024 && p
< src_end
)
5219 source_charbuf
[i
++] = *p
++;
/* Flag the last block to the CCL program once the whole source has
   been staged and the caller marked this as the last block.  */
5221 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
5222 ccl
->last_block
= true;
5223 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5224 charset_map_loaded
= 0;
5225 ccl_driver (ccl
, source_charbuf
, charbuf
, i
, charbuf_end
- charbuf
,
5227 if (charset_map_loaded
5228 && (offset
= coding_change_source (coding
)))
5234 charbuf
+= ccl
->produced
;
/* SRC advances either by the recorded offset of the first
   unconsumed element or, on the other path, by the element count
   itself.  */
5236 src
+= source_byteidx
[ccl
->consumed
];
5238 src
+= ccl
->consumed
;
5239 consumed_chars
+= ccl
->consumed
;
5240 if (p
== src_end
|| ccl
->status
!= CCL_STAT_SUSPEND_BY_SRC
)
/* Translate the final CCL status into a conversion result.  */
5244 switch (ccl
->status
)
5246 case CCL_STAT_SUSPEND_BY_SRC
:
5247 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
5249 case CCL_STAT_SUSPEND_BY_DST
:
5250 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_DST
);
5253 case CCL_STAT_INVALID_CMD
:
5254 record_conversion_result (coding
, CODING_RESULT_INTERRUPT
);
5257 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5260 coding
->consumed_char
+= consumed_chars
;
5261 coding
->consumed
= src
- coding
->source
;
5262 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
/* Encode the characters in CODING->charbuf by running the CCL
   program of CODING (CODING->spec.ccl->ccl) through ccl_driver.
   Each call produces at most 1024 elements into
   DESTINATION_CHARBUF; the low 8 bits of every element are then
   written to CODING->destination.  The final CCL status is
   translated into a conversion result, and CODING->produced_char
   and CODING->produced are updated on exit.  */
5266 encode_coding_ccl (struct coding_system
*coding
)
5268 struct ccl_program
*ccl
= &coding
->spec
.ccl
->ccl
;
5269 bool multibytep
= coding
->dst_multibyte
;
5270 int *charbuf
= coding
->charbuf
;
5271 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5272 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5273 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5274 int destination_charbuf
[1024];
5275 ptrdiff_t produced_chars
= 0;
5277 Lisp_Object attrs
, charset_list
;
5279 CODING_GET_INFO (coding
, attrs
, charset_list
);
/* Flag the last block to the CCL program once every source
   character has been consumed and the caller marked this as the
   last block.  */
5280 if (coding
->consumed_char
== coding
->src_chars
5281 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
5282 ccl
->last_block
= true;
5288 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5289 charset_map_loaded
= 0;
5290 ccl_driver (ccl
, charbuf
, destination_charbuf
,
5291 charbuf_end
- charbuf
, 1024, charset_list
);
5292 if (charset_map_loaded
5293 && (offset
= coding_change_destination (coding
)))
/* Two output paths follow.  The first reserves twice CCL->produced
   bytes and writes through EMIT_ONE_BYTE; the second reserves
   CCL->produced bytes, stores directly through DST and adds
   CCL->produced to PRODUCED_CHARS itself.  NOTE(review): the
   condition choosing between them is not visible here; presumably
   MULTIBYTEP -- confirm.  */
5297 ASSURE_DESTINATION (ccl
->produced
* 2);
5298 for (i
= 0; i
< ccl
->produced
; i
++)
5299 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
5303 ASSURE_DESTINATION (ccl
->produced
);
5304 for (i
= 0; i
< ccl
->produced
; i
++)
5305 *dst
++ = destination_charbuf
[i
] & 0xFF;
5306 produced_chars
+= ccl
->produced
;
5308 charbuf
+= ccl
->consumed
;
/* NOTE(review): presumably these two statuses end the loop early
   -- confirm.  */
5309 if (ccl
->status
== CCL_STAT_QUIT
5310 || ccl
->status
== CCL_STAT_INVALID_CMD
)
5313 while (charbuf
< charbuf_end
);
/* Translate the final CCL status into a conversion result.  */
5315 switch (ccl
->status
)
5317 case CCL_STAT_SUSPEND_BY_SRC
:
5318 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
5320 case CCL_STAT_SUSPEND_BY_DST
:
5321 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_DST
);
5324 case CCL_STAT_INVALID_CMD
:
5325 record_conversion_result (coding
, CODING_RESULT_INTERRUPT
);
5328 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5332 coding
->produced_char
+= produced_chars
;
5333 coding
->produced
= dst
- coding
->destination
;
5338 /*** 10, 11. no-conversion handlers ***/
5340 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
/* No bytes are converted here: CODING->chars_at_source is set to 1
   and the whole source (CODING->src_chars, CODING->src_bytes) is
   marked as consumed.  With DOS end-of-line conversion in effect
   (EOL conversion not inhibited and the EOL type equal to Qdos), a
   trailing '\r' makes the result CODING_RESULT_INSUFFICIENT_SRC and
   takes one character back from the consumed count.  */
5343 decode_coding_raw_text (struct coding_system
*coding
)
5346 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
5348 coding
->chars_at_source
= 1;
5349 coding
->consumed_char
= coding
->src_chars
;
5350 coding
->consumed
= coding
->src_bytes
;
/* NOTE(review): the index CODING->src_bytes - 1 is -1 when the
   source is empty, which would read one byte before
   CODING->source.  No guard on CODING->src_bytes is visible here --
   confirm that callers never pass an empty source, or add a
   CODING->src_bytes > 0 test.  */
5351 if (eol_dos
&& coding
->source
[coding
->src_bytes
- 1] == '\r')
5353 coding
->consumed_char
--;
5355 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
5358 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
/* Copy the characters in CODING->charbuf to CODING->destination,
   writing from offset CODING->produced, without any charset
   conversion.  Where the source is multibyte (CODING->src_multibyte),
   ASCII characters and raw 8-bit bytes (CHAR_BYTE8_P) are tested for
   separately and other characters are expanded with
   CHAR_STRING_ADVANCE.  Several loops cover the combinations of
   source and destination kinds; on exit the result is recorded as
   CODING_RESULT_SUCCESS and CODING->produced_char and
   CODING->produced are updated.  */
5362 encode_coding_raw_text (struct coding_system
*coding
)
5364 bool multibytep
= coding
->dst_multibyte
;
5365 int *charbuf
= coding
->charbuf
;
5366 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
5367 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5368 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5369 ptrdiff_t produced_chars
= 0;
/* Room reserved before each character in the two loops that
   follow.  */
5374 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
5376 if (coding
->src_multibyte
)
5377 while (charbuf
< charbuf_end
)
5379 ASSURE_DESTINATION (safe_room
);
5381 if (ASCII_CHAR_P (c
))
5382 EMIT_ONE_ASCII_BYTE (c
);
5383 else if (CHAR_BYTE8_P (c
))
5385 c
= CHAR_TO_BYTE8 (c
);
/* Expand C into a temporary buffer first; P0 marks its start and
   the expanded bytes are then emitted through EMIT_ONE_BYTE.
   Presumably P1 ends up just past the bytes written -- confirm.  */
5390 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
5392 CHAR_STRING_ADVANCE (c
, p1
);
5395 EMIT_ONE_BYTE (*p0
);
5402 while (charbuf
< charbuf_end
)
5404 ASSURE_DESTINATION (safe_room
);
5411 if (coding
->src_multibyte
)
/* This loop declares its own, smaller SAFE_ROOM and writes
   straight through DST.  */
5413 int safe_room
= MAX_MULTIBYTE_LENGTH
;
5415 while (charbuf
< charbuf_end
)
5417 ASSURE_DESTINATION (safe_room
);
5419 if (ASCII_CHAR_P (c
))
5421 else if (CHAR_BYTE8_P (c
))
5422 *dst
++ = CHAR_TO_BYTE8 (c
);
5424 CHAR_STRING_ADVANCE (c
, dst
);
/* Reserve one byte per remaining element up front; the copy loop
   is additionally bounded by DST_END.  */
5429 ASSURE_DESTINATION (charbuf_end
- charbuf
);
5430 while (charbuf
< charbuf_end
&& dst
< dst_end
)
5431 *dst
++ = *charbuf
++;
/* Here the produced-character count is the number of bytes written
   since entry (CODING->produced has not been updated yet).  */
5433 produced_chars
= dst
- (coding
->destination
+ coding
->produced
);
5435 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5436 coding
->produced_char
+= produced_chars
;
5437 coding
->produced
= dst
- coding
->destination
;
5441 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5442 Return true if a text is encoded in a charset-based coding system. */
/* CATEGORY_MASK_CHARSET is first marked as checked in DETECT_INFO.
   The local CODING is then redirected to the coding_category_charset
   entry of coding_categories, so ATTRS, VALIDS and NAME below
   describe that category rather than the coding system passed in.
   When NAME starts with "iso-8859-" or "iso-latin-",
   CHECK_LATIN_EXTRA is set and Vlatin_extra_code_table takes part
   in the per-byte test.  The local verdict FOUND is merged into
   DETECT_INFO->found at the end.  */
5445 detect_coding_charset (struct coding_system
*coding
,
5446 struct coding_detection_info
*detect_info
)
5448 const unsigned char *src
= coding
->source
, *src_base
;
5449 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5450 bool multibytep
= coding
->src_multibyte
;
5451 ptrdiff_t consumed_chars
= 0;
5452 Lisp_Object attrs
, valids
, name
;
5454 ptrdiff_t head_ascii
= coding
->head_ascii
;
5455 bool check_latin_extra
= 0;
5457 detect_info
->checked
|= CATEGORY_MASK_CHARSET
;
/* From here on CODING refers to the charset category entry.  */
5459 coding
= &coding_categories
[coding_category_charset
];
5460 attrs
= CODING_ID_ATTRS (coding
->id
);
5461 valids
= AREF (attrs
, coding_attr_charset_valids
);
5462 name
= CODING_ID_NAME (coding
->id
);
/* Only the prefix of the name is compared: sizeof of the literal
   minus one leaves out its terminating NUL.  */
5463 if (strncmp (SSDATA (SYMBOL_NAME (name
)),
5464 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5465 || strncmp (SSDATA (SYMBOL_NAME (name
)),
5466 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5467 check_latin_extra
= 1;
5469 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
5476 struct charset
*charset
;
/* Look up the entry of VALIDS for the byte C.  */
5483 val
= AREF (valids
, c
);
5489 && check_latin_extra
5490 && (!VECTORP (Vlatin_extra_code_table
)
5491 || NILP (AREF (Vlatin_extra_code_table
, c
))))
5493 found
= CATEGORY_MASK_CHARSET
;
/* Here VAL is used as a single charset ID.  */
5497 charset
= CHARSET_FROM_ID (XFASTINT (val
));
5498 dim
= CHARSET_DIMENSION (charset
);
5499 for (idx
= 1; idx
< dim
; idx
++)
/* Test C against the lower and upper bounds stored in CODE_SPACE
   at index (DIM - 1 - IDX) * 4 and the slot after it.  */
5504 if (c
< charset
->code_space
[(dim
- 1 - idx
) * 4]
5505 || c
> charset
->code_space
[(dim
- 1 - idx
) * 4 + 1])
/* Here VAL is a list of charset IDs; each is tried in turn.  */
5514 for (; CONSP (val
); val
= XCDR (val
))
5516 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
5517 dim
= CHARSET_DIMENSION (charset
);
5523 if (c
< charset
->code_space
[(dim
- 1 - idx
) * 4]
5524 || c
> charset
->code_space
[(dim
- 1 - idx
) * 4 + 1])
5539 detect_info
->rejected
|= CATEGORY_MASK_CHARSET
;
/* Merge the local verdict into DETECT_INFO.  */
5543 detect_info
->found
|= found
;
/* Decode the source of CODING, starting at CODING->source +
   CODING->consumed, into CODING->charbuf for a charset-based coding
   system.  Each byte C is looked up in the
   coding_attr_charset_valids attribute; the entry is used either as
   a single charset ID or as a list of charset IDs.  Further bytes
   are folded into CODE as (CODE << 8) | C before CODING_DECODE_CHAR
   is applied.  When the charset differs from LAST_ID and is not
   charset_ascii, ADD_CHARSET_DATA annotates the preceding run
   unless that run was in charset_ascii.  With DOS end-of-line
   conversion in effect, the byte following a '\r' is read ahead
   into BYTE_AFTER_CR.  On exit CODING->consumed_char,
   CODING->consumed and CODING->charbuf_used are updated from
   CONSUMED_CHARS_BASE, SRC_BASE and CHARBUF.  */
5548 decode_coding_charset (struct coding_system
*coding
)
5550 const unsigned char *src
= coding
->source
+ coding
->consumed
;
5551 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5552 const unsigned char *src_base
;
5553 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
5554 /* We may produce one charset annotation in one loop and one more at
5557 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 2);
5558 ptrdiff_t consumed_chars
= 0, consumed_chars_base
;
5559 bool multibytep
= coding
->src_multibyte
;
5560 Lisp_Object attrs
= CODING_ID_ATTRS (coding
->id
);
5562 ptrdiff_t char_offset
= coding
->produced_char
;
5563 ptrdiff_t last_offset
= char_offset
;
5564 int last_id
= charset_ascii
;
5566 = !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
5567 int byte_after_cr
= -1;
5569 valids
= AREF (attrs
, coding_attr_charset_valids
);
5575 struct charset
*charset
;
5581 consumed_chars_base
= consumed_chars
;
5583 if (charbuf
>= charbuf_end
)
5585 if (byte_after_cr
>= 0)
5590 if (byte_after_cr
>= 0)
5598 if (eol_dos
&& c
== '\r')
5599 ONE_MORE_BYTE (byte_after_cr
);
5605 val
= AREF (valids
, c
);
5606 if (! INTEGERP (val
) && ! CONSP (val
))
5610 charset
= CHARSET_FROM_ID (XFASTINT (val
));
5611 dim
= CHARSET_DIMENSION (charset
);
5615 code
= (code
<< 8) | c
;
5618 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
5623 /* VAL is a list of charset IDs. It is assured that the
5624 list is sorted by charset dimensions (smaller one
5628 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
5629 dim
= CHARSET_DIMENSION (charset
);
5633 code
= (code
<< 8) | c
;
5636 CODING_DECODE_CHAR (coding
, src
, src_base
,
5637 src_end
, charset
, code
, c
);
5645 if (charset
->id
!= charset_ascii
5646 && last_id
!= charset
->id
)
5648 if (last_id
!= charset_ascii
)
5649 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
5650 last_id
= charset
->id
;
5651 last_offset
= char_offset
;
5660 consumed_chars
= consumed_chars_base
;
5662 *charbuf
++ = c
< 0 ? -c
: ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
5668 if (last_id
!= charset_ascii
)
5669 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
5670 coding
->consumed_char
+= consumed_chars_base
;
5671 coding
->consumed
= src_base
- coding
->source
;
5672 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
/* Encode the characters in CODING->charbuf (CODING->charbuf_used of
   them) into CODING->destination, writing from offset
   CODING->produced, for a charset-based coding system.  ASCII
   characters are emitted directly when the coding system is ASCII
   compatible; otherwise CODING_CHAR_CHARSET is consulted and the
   resulting code is written with as many bytes as the dimension of
   the charset.  On exit the result is recorded as
   CODING_RESULT_SUCCESS and CODING->produced_char and
   CODING->produced are updated.  */
5676 encode_coding_charset (struct coding_system
*coding
)
5678 bool multibytep
= coding
->dst_multibyte
;
5679 int *charbuf
= coding
->charbuf
;
5680 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5681 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5682 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5683 int safe_room
= MAX_MULTIBYTE_LENGTH
;
5684 ptrdiff_t produced_chars
= 0;
5685 Lisp_Object attrs
, charset_list
;
5686 bool ascii_compatible
;
5689 CODING_GET_INFO (coding
, attrs
, charset_list
);
5690 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
5692 while (charbuf
< charbuf_end
)
5694 struct charset
*charset
;
5697 ASSURE_DESTINATION (safe_room
);
5699 if (ascii_compatible
&& ASCII_CHAR_P (c
))
5700 EMIT_ONE_ASCII_BYTE (c
);
5701 else if (CHAR_BYTE8_P (c
))
5703 c
= CHAR_TO_BYTE8 (c
);
5708 CODING_CHAR_CHARSET (coding
, dst
, dst_end
, c
, charset_list
,
/* Emit CODE most significant byte first, using one, two, three or
   four bytes according to the dimension of CHARSET.  */
5713 if (CHARSET_DIMENSION (charset
) == 1)
5714 EMIT_ONE_BYTE (code
);
5715 else if (CHARSET_DIMENSION (charset
) == 2)
5716 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
5717 else if (CHARSET_DIMENSION (charset
) == 3)
5718 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
5720 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
5721 (code
>> 8) & 0xFF, code
& 0xFF);
/* In safe-encoding mode C becomes
   CODING_INHIBIT_CHARACTER_SUBSTITUTION; the other assignment
   substitutes CODING->default_char.  */
5725 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
5726 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
5728 c
= coding
->default_char
;
/* Record success and publish the character and byte counts.  */
5734 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5735 coding
->produced_char
+= produced_chars
;
5736 coding
->produced
= dst
- coding
->destination
;
5741 /*** 10. C library functions ***/
5743 /* Setup coding context CODING from information about CODING_SYSTEM.
5744 If CODING_SYSTEM is nil, Qundecided is used in its place. If
5745 CODING_SYSTEM is invalid, signal an error. */
/* The function fills in CODING->id, the common flags, the cached
   safe-charsets data, the default character, and then the detector,
   decoder and encoder functions together with type-specific state,
   according to the coding type found in the attributes.  */
5748 setup_coding_system (Lisp_Object coding_system
, struct coding_system
*coding
)
5751 Lisp_Object eol_type
;
5752 Lisp_Object coding_type
;
5755 if (NILP (coding_system
))
5756 coding_system
= Qundecided
;
5758 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
5760 attrs
= CODING_ID_ATTRS (coding
->id
);
/* With EOL conversion inhibited the EOL type is forced to Qunix.  */
5761 eol_type
= inhibit_eol_conversion
? Qunix
: CODING_ID_EOL_TYPE (coding
->id
);
/* Initial common flags by EOL type: a vector asks for decoding and
   detection, a type other than Qunix asks for decoding and
   encoding, and otherwise the flags start at 0.  */
5764 if (VECTORP (eol_type
))
5765 coding
->common_flags
= (CODING_REQUIRE_DECODING_MASK
5766 | CODING_REQUIRE_DETECTION_MASK
);
5767 else if (! EQ (eol_type
, Qunix
))
5768 coding
->common_flags
= (CODING_REQUIRE_DECODING_MASK
5769 | CODING_REQUIRE_ENCODING_MASK
);
5771 coding
->common_flags
= 0;
/* A post-read attribute requires decoding, a pre-write attribute
   requires encoding, and the for-unibyte attribute sets its own
   flag.  */
5772 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
5773 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
5774 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
5775 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
5776 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs
)))
5777 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
/* Cache the safe-charsets string: its length minus one becomes the
   maximum charset ID and its data pointer is stored directly.  */
5779 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
5780 coding
->max_charset_id
= SCHARS (val
) - 1;
5781 coding
->safe_charsets
= SDATA (val
);
5782 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
5783 coding
->carryover_bytes
= 0;
5784 coding
->raw_destination
= 0;
/* Install the detector, decoder and encoder that match the coding
   type, together with any type-specific state.  */
5786 coding_type
= CODING_ATTR_TYPE (attrs
);
5787 if (EQ (coding_type
, Qundecided
))
5789 coding
->detector
= NULL
;
5790 coding
->decoder
= decode_coding_raw_text
;
5791 coding
->encoder
= encode_coding_raw_text
;
5792 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
5793 coding
->spec
.undecided
.inhibit_nbd
5794 = (encode_inhibit_flag
5795 (AREF (attrs
, coding_attr_undecided_inhibit_null_byte_detection
)));
5796 coding
->spec
.undecided
.inhibit_ied
5797 = (encode_inhibit_flag
5798 (AREF (attrs
, coding_attr_undecided_inhibit_iso_escape_detection
)));
5799 coding
->spec
.undecided
.prefer_utf_8
5800 = ! NILP (AREF (attrs
, coding_attr_undecided_prefer_utf_8
));
5802 else if (EQ (coding_type
, Qiso_2022
))
5805 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
5807 /* Invoke graphic register 0 to plane 0. */
5808 CODING_ISO_INVOCATION (coding
, 0) = 0;
5809 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5810 CODING_ISO_INVOCATION (coding
, 1)
5811 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
5812 /* Setup the initial status of designation. */
5813 for (i
= 0; i
< 4; i
++)
5814 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
5815 /* Not single shifting initially. */
5816 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
5817 /* Beginning of buffer should also be regarded as bol. */
5818 CODING_ISO_BOL (coding
) = 1;
5819 coding
->detector
= detect_coding_iso_2022
;
5820 coding
->decoder
= decode_coding_iso_2022
;
5821 coding
->encoder
= encode_coding_iso_2022
;
5822 if (flags
& CODING_ISO_FLAG_SAFE
)
5823 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
5824 coding
->common_flags
5825 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
5826 | CODING_REQUIRE_FLUSHING_MASK
);
5827 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
5828 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
5829 if (flags
& CODING_ISO_FLAG_DESIGNATION
)
5830 coding
->common_flags
|= CODING_ANNOTATE_CHARSET_MASK
;
5831 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
/* With full support, setup_iso_safe_charsets is called and the
   safe-charsets cache is then refreshed from ATTRS.  */
5833 setup_iso_safe_charsets (attrs
);
5834 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
5835 coding
->max_charset_id
= SCHARS (val
) - 1;
5836 coding
->safe_charsets
= SDATA (val
);
5838 CODING_ISO_FLAGS (coding
) = flags
;
5839 CODING_ISO_CMP_STATUS (coding
)->state
= COMPOSING_NO
;
5840 CODING_ISO_CMP_STATUS (coding
)->method
= COMPOSITION_NO
;
5841 CODING_ISO_EXTSEGMENT_LEN (coding
) = 0;
5842 CODING_ISO_EMBEDDED_UTF_8 (coding
) = 0;
5844 else if (EQ (coding_type
, Qcharset
))
5846 coding
->detector
= detect_coding_charset
;
5847 coding
->decoder
= decode_coding_charset
;
5848 coding
->encoder
= encode_coding_charset
;
5849 coding
->common_flags
5850 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5852 else if (EQ (coding_type
, Qutf_8
))
5854 val
= AREF (attrs
, coding_attr_utf_bom
);
5855 CODING_UTF_8_BOM (coding
) = (CONSP (val
) ? utf_detect_bom
5856 : EQ (val
, Qt
) ? utf_with_bom
5858 coding
->detector
= detect_coding_utf_8
;
5859 coding
->decoder
= decode_coding_utf_8
;
5860 coding
->encoder
= encode_coding_utf_8
;
5861 coding
->common_flags
5862 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
/* A BOM setting of utf_detect_bom additionally requires
   detection.  */
5863 if (CODING_UTF_8_BOM (coding
) == utf_detect_bom
)
5864 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
5866 else if (EQ (coding_type
, Qutf_16
))
5868 val
= AREF (attrs
, coding_attr_utf_bom
);
5869 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_detect_bom
5870 : EQ (val
, Qt
) ? utf_with_bom
5872 val
= AREF (attrs
, coding_attr_utf_16_endian
);
5873 CODING_UTF_16_ENDIAN (coding
) = (EQ (val
, Qbig
) ? utf_16_big_endian
5874 : utf_16_little_endian
);
5875 CODING_UTF_16_SURROGATE (coding
) = 0;
5876 coding
->detector
= detect_coding_utf_16
;
5877 coding
->decoder
= decode_coding_utf_16
;
5878 coding
->encoder
= encode_coding_utf_16
;
5879 coding
->common_flags
5880 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5881 if (CODING_UTF_16_BOM (coding
) == utf_detect_bom
)
5882 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
5884 else if (EQ (coding_type
, Qccl
))
5886 coding
->detector
= detect_coding_ccl
;
5887 coding
->decoder
= decode_coding_ccl
;
5888 coding
->encoder
= encode_coding_ccl
;
5889 coding
->common_flags
5890 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
5891 | CODING_REQUIRE_FLUSHING_MASK
);
5893 else if (EQ (coding_type
, Qemacs_mule
))
5895 coding
->detector
= detect_coding_emacs_mule
;
5896 coding
->decoder
= decode_coding_emacs_mule
;
5897 coding
->encoder
= encode_coding_emacs_mule
;
5898 coding
->common_flags
5899 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5900 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
5901 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
/* Build a safe-charsets table covering Vemacs_mule_charset_list:
   find the largest charset ID, fill a string of that size plus one
   with 255, then store 0 at the index of every listed charset.  */
5903 Lisp_Object tail
, safe_charsets
;
5904 int max_charset_id
= 0;
5906 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
5908 if (max_charset_id
< XFASTINT (XCAR (tail
)))
5909 max_charset_id
= XFASTINT (XCAR (tail
));
5910 safe_charsets
= make_uninit_string (max_charset_id
+ 1);
5911 memset (SDATA (safe_charsets
), 255, max_charset_id
+ 1);
5912 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
5914 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
5915 coding
->max_charset_id
= max_charset_id
;
5916 coding
->safe_charsets
= SDATA (safe_charsets
);
5918 coding
->spec
.emacs_mule
.cmp_status
.state
= COMPOSING_NO
;
5919 coding
->spec
.emacs_mule
.cmp_status
.method
= COMPOSITION_NO
;
5921 else if (EQ (coding_type
, Qshift_jis
))
5923 coding
->detector
= detect_coding_sjis
;
5924 coding
->decoder
= decode_coding_sjis
;
5925 coding
->encoder
= encode_coding_sjis
;
5926 coding
->common_flags
5927 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5929 else if (EQ (coding_type
, Qbig5
))
5931 coding
->detector
= detect_coding_big5
;
5932 coding
->decoder
= decode_coding_big5
;
5933 coding
->encoder
= encode_coding_big5
;
5934 coding
->common_flags
5935 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5937 else /* EQ (coding_type, Qraw_text) */
5939 coding
->detector
= NULL
;
5940 coding
->decoder
= decode_coding_raw_text
;
5941 coding
->encoder
= encode_coding_raw_text
;
/* For raw text, an EOL type other than Qunix requires decoding, and
   an EOL type that is not a vector requires encoding.  */
5942 if (! EQ (eol_type
, Qunix
))
5944 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
5945 if (! VECTORP (eol_type
))
5946 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
5954 /* Return a list of charsets supported by CODING.  The list fetched
   by CODING_GET_INFO is replaced by Viso_2022_charset_list for an
   ISO-2022 coding system whose flags include
   CODING_ISO_FLAG_FULL_SUPPORT, and by Vemacs_mule_charset_list for
   an emacs-mule coding system.  */
5957 coding_charset_list (struct coding_system
*coding
)
5959 Lisp_Object attrs
, charset_list
;
5961 CODING_GET_INFO (coding
, attrs
, charset_list
);
5962 if (EQ (CODING_ATTR_TYPE (attrs
), Qiso_2022
))
5964 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
5966 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
5967 charset_list
= Viso_2022_charset_list
;
5969 else if (EQ (CODING_ATTR_TYPE (attrs
), Qemacs_mule
))
5971 charset_list
= Vemacs_mule_charset_list
;
5973 return charset_list
;
5977 /* Return a list of charsets supported by CODING-SYSTEM.  The
   coding system is resolved to an ID with
   CHECK_CODING_SYSTEM_GET_ID.  The result is Viso_2022_charset_list
   for an ISO-2022 coding system whose flags include
   CODING_ISO_FLAG_FULL_SUPPORT, Vemacs_mule_charset_list for an
   emacs-mule coding system, and the charset list stored in the
   attributes otherwise.  */
5980 coding_system_charset_list (Lisp_Object coding_system
)
5983 Lisp_Object attrs
, charset_list
;
5985 CHECK_CODING_SYSTEM_GET_ID (coding_system
, id
);
5986 attrs
= CODING_ID_ATTRS (id
);
5988 if (EQ (CODING_ATTR_TYPE (attrs
), Qiso_2022
))
5990 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
5992 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
5993 charset_list
= Viso_2022_charset_list
;
5995 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
5997 else if (EQ (CODING_ATTR_TYPE (attrs
), Qemacs_mule
))
5999 charset_list
= Vemacs_mule_charset_list
;
6003 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
6005 return charset_list
;
6009 /* Return raw-text or one of its subsidiaries that has the same
6010 eol_type as CODING-SYSTEM. */
6013 raw_text_coding_system (Lisp_Object coding_system
)
6015 Lisp_Object spec
, attrs
;
6016 Lisp_Object eol_type
, raw_text_eol_type
;
6018 if (NILP (coding_system
))
/* Slot 0 of the spec holds the attributes.  */
6020 spec
= CODING_SYSTEM_SPEC (coding_system
);
6021 attrs
= AREF (spec
, 0);
/* A coding system already of type raw-text is returned as is.  */
6023 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
6024 return coding_system
;
/* Slot 2 of the spec holds the EOL type.  */
6026 eol_type
= AREF (spec
, 2);
6027 if (VECTORP (eol_type
))
/* Pick the raw-text subsidiary by EOL type: slot 0 for Qunix, slot
   1 for Qdos, slot 2 for anything else.  */
6029 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
6030 raw_text_eol_type
= AREF (spec
, 2);
6031 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
6032 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
6033 : AREF (raw_text_eol_type
, 2));
6037 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6038 the subsidiary that has the same eol-spec as PARENT (if it is not
6039 nil and specifies end-of-line format) or the system's setting
6040 (system_eol_type). */
6043 coding_inherit_eol_type (Lisp_Object coding_system
, Lisp_Object parent
)
6045 Lisp_Object spec
, eol_type
;
/* A nil CODING_SYSTEM is treated as Qraw_text.  */
6047 if (NILP (coding_system
))
6048 coding_system
= Qraw_text
;
6049 spec
= CODING_SYSTEM_SPEC (coding_system
);
6050 eol_type
= AREF (spec
, 2);
/* Only a vector-valued EOL type is replaced by a subsidiary.  */
6051 if (VECTORP (eol_type
))
6053 Lisp_Object parent_eol_type
;
6055 if (! NILP (parent
))
6057 Lisp_Object parent_spec
;
6059 parent_spec
= CODING_SYSTEM_SPEC (parent
);
6060 parent_eol_type
= AREF (parent_spec
, 2);
/* A parent whose own EOL type is a vector falls back to
   system_eol_type.  */
6061 if (VECTORP (parent_eol_type
))
6062 parent_eol_type
= system_eol_type
;
6065 parent_eol_type
= system_eol_type
;
/* Select the subsidiary: slot 0 for Qunix, slot 1 for Qdos, slot 2
   for Qmac.  */
6066 if (EQ (parent_eol_type
, Qunix
))
6067 coding_system
= AREF (eol_type
, 0);
6068 else if (EQ (parent_eol_type
, Qdos
))
6069 coding_system
= AREF (eol_type
, 1);
6070 else if (EQ (parent_eol_type
, Qmac
))
6071 coding_system
= AREF (eol_type
, 2);
6073 return coding_system
;
6077 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6078 decided for writing to a process. If not, complement them, and
6079 return a new coding system. */
6082 complement_process_encoding_system (Lisp_Object coding_system
)
6084 Lisp_Object coding_base
= Qnil
, eol_base
= Qnil
;
6085 Lisp_Object spec
, attrs
;
/* Up to three candidates are examined.  Replacement candidates come
   from the cdr of Vdefault_process_coding_system and from
   preferred_coding_system ().  NOTE(review): the conditions that
   select each candidate are not visible here -- confirm.  */
6088 for (i
= 0; i
< 3; i
++)
6091 coding_system
= CDR_SAFE (Vdefault_process_coding_system
);
6093 coding_system
= preferred_coding_system ();
6094 spec
= CODING_SYSTEM_SPEC (coding_system
);
6097 attrs
= AREF (spec
, 0);
/* Remember the first candidate that fixes the text conversion (its
   type is not Qundecided) and the first that fixes the EOL format
   (slot 2 of its spec is not a vector).  */
6098 if (NILP (coding_base
) && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
6099 coding_base
= CODING_ATTR_BASE_NAME (attrs
);
6100 if (NILP (eol_base
) && ! VECTORP (AREF (spec
, 2)))
6101 eol_base
= coding_system
;
6102 if (! NILP (coding_base
) && ! NILP (eol_base
))
6107 /* The original CODING_SYSTEM didn't specify text-conversion or
6108 eol-conversion. Be sure that we return a fully complemented
6110 coding_system
= coding_inherit_eol_type (coding_base
, eol_base
);
6111 return coding_system
;
6115 /* Emacs has a mechanism to automatically detect a coding system if it
6116 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6117 it's impossible to distinguish some coding systems accurately
6118 because they use the same range of codes. So, at first, coding
6119 systems are categorized into the following categories:
6121 o coding-category-emacs-mule
6123 The category for a coding system which has the same code range
6124 as Emacs' internal format. Assigned the coding-system (Lisp
6125 symbol) `emacs-mule' by default.
6127 o coding-category-sjis
6129 The category for a coding system which has the same code range
6130 as SJIS. Assigned the coding-system (Lisp
6131 symbol) `japanese-shift-jis' by default.
6133 o coding-category-iso-7
6135 The category for a coding system which has the same code range
6136 as ISO2022 of 7-bit environment. This doesn't use any locking
6137 shift and single shift functions. This can encode/decode all
6138 charsets. Assigned the coding-system (Lisp symbol)
6139 `iso-2022-7bit' by default.
6141 o coding-category-iso-7-tight
6143 Same as coding-category-iso-7 except that this can
6144 encode/decode only the specified charsets.
6146 o coding-category-iso-8-1
6148 The category for a coding system which has the same code range
6149 as ISO2022 of 8-bit environment and graphic plane 1 used only
6150 for DIMENSION1 charset. This doesn't use any locking shift
6151 and single shift functions. Assigned the coding-system (Lisp
6152 symbol) `iso-latin-1' by default.
6154 o coding-category-iso-8-2
6156 The category for a coding system which has the same code range
6157 as ISO2022 of 8-bit environment and graphic plane 1 used only
6158 for DIMENSION2 charset. This doesn't use any locking shift
6159 and single shift functions. Assigned the coding-system (Lisp
6160 symbol) `japanese-iso-8bit' by default.
6162 o coding-category-iso-7-else
6164 The category for a coding system which has the same code range
6165 as ISO2022 of 7-bit environment but uses locking shift or
6166 single shift functions. Assigned the coding-system (Lisp
6167 symbol) `iso-2022-7bit-lock' by default.
6169 o coding-category-iso-8-else
6171 The category for a coding system which has the same code range
6172 as ISO2022 of 8-bit environment but uses locking shift or
6173 single shift functions. Assigned the coding-system (Lisp
6174 symbol) `iso-2022-8bit-ss2' by default.
6176 o coding-category-big5
6178 The category for a coding system which has the same code range
6179 as BIG5. Assigned the coding-system (Lisp symbol)
6180 `cn-big5' by default.
6182 o coding-category-utf-8
6184 The category for a coding system which has the same code range
6185 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6186 symbol) `utf-8' by default.
6188 o coding-category-utf-16-be
6190 The category for a coding system in which a text has a
6191 Unicode signature (cf. Unicode Standard) in the order of BIG
6192 endian at the head. Assigned the coding-system (Lisp symbol)
6193 `utf-16-be' by default.
6195 o coding-category-utf-16-le
6197 The category for a coding system in which a text has a
6198 Unicode signature (cf. Unicode Standard) in the order of
6199 LITTLE endian at the head. Assigned the coding-system (Lisp
6200 symbol) `utf-16-le' by default.
6202 o coding-category-ccl
6204 The category for a coding system of which encoder/decoder is
6205 written in CCL programs. The default value is nil, i.e., no
6206 coding system is assigned.
6208 o coding-category-binary
6210 The category for a coding system not categorized in any of the
6211 above. Assigned the coding-system (Lisp symbol)
6212 `no-conversion' by default.
6214 Each of them is a Lisp symbol and the value is an actual
6215 `coding-system' (this is also a Lisp symbol) assigned by a user.
6216 What Emacs does actually is to detect a category of coding system.
6217 Then, it uses a `coding-system' assigned to it. If Emacs can't
6218 decide only one possible category, it selects a category of the
6219 highest priority. Priorities of categories are also specified by a
6220 user in a Lisp variable `coding-category-list'.
6224 static Lisp_Object
adjust_coding_eol_type (struct coding_system
*coding
,
6228 /* Return the number of ASCII characters at the head of the source.
6229 By side effects, set coding->head_ascii and update
6230 coding->eol_seen. The value of coding->eol_seen is "logical or" of
6231 EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6232 reliable only when all the source bytes are ASCII. */
6235 check_ascii (struct coding_system
*coding
)
6237 const unsigned char *src
, *end
;
6238 Lisp_Object eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
/* Start from the EOL kinds already recorded in CODING.  */
6239 int eol_seen
= coding
->eol_seen
;
/* coding_set_source is called before CODING->source is read.  */
6241 coding_set_source (coding
);
6242 src
= coding
->source
;
6243 end
= src
+ coding
->src_bytes
;
/* The simple scan applies when EOL conversion is inhibited or the
   EOL type is a symbol.  */
6245 if (inhibit_eol_conversion
6246 || SYMBOLP (eol_type
))
6248 /* We don't have to check EOL format. */
/* Stop at the first byte with bit 0x80 set.  */
6249 while (src
< end
&& !( *src
& 0x80))
6252 eol_seen
|= EOL_SEEN_LF
;
6257 end
--; /* We look ahead one byte for "CR LF". */
6269 eol_seen
|= EOL_SEEN_CRLF
;
6273 eol_seen
|= EOL_SEEN_CR
;
6276 eol_seen
|= EOL_SEEN_LF
;
6282 /* All bytes but the last one C are ASCII. */
6286 eol_seen
|= EOL_SEEN_CR
;
6288 eol_seen
|= EOL_SEEN_LF
;
/* HEAD_ASCII is the offset of SRC from the start of the source at
   the point where scanning stopped.  */
6293 coding
->head_ascii
= src
- coding
->source
;
6294 coding
->eol_seen
= eol_seen
;
6295 return (coding
->head_ascii
);
6299 /* Return the number of characters at the source if all the bytes are
6300 valid UTF-8 (of Unicode range). Otherwise, return -1. By side
6301 effects, update coding->eol_seen. The value of coding->eol_seen is
6302 "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6303 the value is reliable only when all the source bytes are valid
6307 check_utf_8 (struct coding_system
*coding
)
6309 const unsigned char *src
, *end
;
6311 ptrdiff_t nchars
= coding
->head_ascii
;
6313 if (coding
->head_ascii
< 0)
6314 check_ascii (coding
);
6316 coding_set_source (coding
);
6317 src
= coding
->source
+ coding
->head_ascii
;
6318 /* We look ahead one byte for CR LF. */
6319 end
= coding
->source
+ coding
->src_bytes
- 1;
6320 eol_seen
= coding
->eol_seen
;
6325 if (UTF_8_1_OCTET_P (*src
))
6334 eol_seen
|= EOL_SEEN_CRLF
;
6339 eol_seen
|= EOL_SEEN_CR
;
6342 eol_seen
|= EOL_SEEN_LF
;
6345 else if (UTF_8_2_OCTET_LEADING_P (c
))
6347 if (c
< 0xC2 /* overlong sequence */
6349 || ! UTF_8_EXTRA_OCTET_P (src
[1]))
6353 else if (UTF_8_3_OCTET_LEADING_P (c
))
6356 || ! (UTF_8_EXTRA_OCTET_P (src
[1])
6357 && UTF_8_EXTRA_OCTET_P (src
[2])))
6359 c
= (((c
& 0xF) << 12)
6360 | ((src
[1] & 0x3F) << 6) | (src
[2] & 0x3F));
6361 if (c
< 0x800 /* overlong sequence */
6362 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
6366 else if (UTF_8_4_OCTET_LEADING_P (c
))
6369 || ! (UTF_8_EXTRA_OCTET_P (src
[1])
6370 && UTF_8_EXTRA_OCTET_P (src
[2])
6371 && UTF_8_EXTRA_OCTET_P (src
[3])))
6373 c
= (((c
& 0x7) << 18) | ((src
[1] & 0x3F) << 12)
6374 | ((src
[2] & 0x3F) << 6) | (src
[3] & 0x3F));
6375 if (c
< 0x10000 /* overlong sequence */
6376 || c
>= 0x110000) /* non-Unicode character */
6387 if (! UTF_8_1_OCTET_P (*src
))
6391 eol_seen
|= EOL_SEEN_CR
;
6392 else if (*src
== '\n')
6393 eol_seen
|= EOL_SEEN_LF
;
6395 coding
->eol_seen
= eol_seen
;
6400 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6401 SOURCE is encoded. If CATEGORY is one of
6402 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6403 two-byte, else they are encoded by one-byte.
6405 Return one of EOL_SEEN_XXX. */
6407 #define MAX_EOL_CHECK_COUNT 3
6410 detect_eol (const unsigned char *source
, ptrdiff_t src_bytes
,
6411 enum coding_category category
)
6413 const unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
6416 int eol_seen
= EOL_SEEN_NONE
;
6418 if ((1 << category
) & CATEGORY_MASK_UTF_16
)
6420 bool msb
= category
== (coding_category_utf_16_le
6421 | coding_category_utf_16_le_nosig
);
6424 while (src
+ 1 < src_end
)
6427 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
6432 this_eol
= EOL_SEEN_LF
;
6433 else if (src
+ 3 >= src_end
6434 || src
[msb
+ 2] != 0
6435 || src
[lsb
+ 2] != '\n')
6436 this_eol
= EOL_SEEN_CR
;
6439 this_eol
= EOL_SEEN_CRLF
;
6443 if (eol_seen
== EOL_SEEN_NONE
)
6444 /* This is the first end-of-line. */
6445 eol_seen
= this_eol
;
6446 else if (eol_seen
!= this_eol
)
6448 /* The found type is different from what found before.
6449 Allow for stray ^M characters in DOS EOL files. */
6450 if ((eol_seen
== EOL_SEEN_CR
&& this_eol
== EOL_SEEN_CRLF
)
6451 || (eol_seen
== EOL_SEEN_CRLF
6452 && this_eol
== EOL_SEEN_CR
))
6453 eol_seen
= EOL_SEEN_CRLF
;
6456 eol_seen
= EOL_SEEN_LF
;
6460 if (++total
== MAX_EOL_CHECK_COUNT
)
6467 while (src
< src_end
)
6470 if (c
== '\n' || c
== '\r')
6475 this_eol
= EOL_SEEN_LF
;
6476 else if (src
>= src_end
|| *src
!= '\n')
6477 this_eol
= EOL_SEEN_CR
;
6479 this_eol
= EOL_SEEN_CRLF
, src
++;
6481 if (eol_seen
== EOL_SEEN_NONE
)
6482 /* This is the first end-of-line. */
6483 eol_seen
= this_eol
;
6484 else if (eol_seen
!= this_eol
)
6486 /* The found type is different from what found before.
6487 Allow for stray ^M characters in DOS EOL files. */
6488 if ((eol_seen
== EOL_SEEN_CR
&& this_eol
== EOL_SEEN_CRLF
)
6489 || (eol_seen
== EOL_SEEN_CRLF
&& this_eol
== EOL_SEEN_CR
))
6490 eol_seen
= EOL_SEEN_CRLF
;
6493 eol_seen
= EOL_SEEN_LF
;
6497 if (++total
== MAX_EOL_CHECK_COUNT
)
6506 adjust_coding_eol_type (struct coding_system
*coding
, int eol_seen
)
6508 Lisp_Object eol_type
;
6510 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6511 if (! VECTORP (eol_type
))
6512 /* Already adjusted. */
6514 if (eol_seen
& EOL_SEEN_LF
)
6516 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
6519 else if (eol_seen
& EOL_SEEN_CRLF
)
6521 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
6524 else if (eol_seen
& EOL_SEEN_CR
)
6526 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
6532 /* Detect how a text specified in CODING is encoded. If a coding
6533 system is detected, update fields of CODING by the detected coding
6537 detect_coding (struct coding_system
*coding
)
6539 const unsigned char *src
, *src_end
;
6540 unsigned int saved_mode
= coding
->mode
;
6541 Lisp_Object found
= Qnil
;
6542 Lisp_Object eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6544 coding
->consumed
= coding
->consumed_char
= 0;
6545 coding
->produced
= coding
->produced_char
= 0;
6546 coding_set_source (coding
);
6548 src_end
= coding
->source
+ coding
->src_bytes
;
6550 coding
->eol_seen
= EOL_SEEN_NONE
;
6551 /* If we have not yet decided the text encoding type, detect it
6553 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
6556 struct coding_detection_info detect_info
;
6557 bool null_byte_found
= 0, eight_bit_found
= 0;
6558 bool inhibit_nbd
= inhibit_flag (coding
->spec
.undecided
.inhibit_nbd
,
6559 inhibit_null_byte_detection
);
6560 bool inhibit_ied
= inhibit_flag (coding
->spec
.undecided
.inhibit_ied
,
6561 inhibit_iso_escape_detection
);
6562 bool prefer_utf_8
= coding
->spec
.undecided
.prefer_utf_8
;
6564 coding
->head_ascii
= 0;
6565 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
6566 for (src
= coding
->source
; src
< src_end
; src
++)
6571 eight_bit_found
= 1;
6572 if (null_byte_found
)
6577 if ((c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
6579 && ! detect_info
.checked
)
6581 if (detect_coding_iso_2022 (coding
, &detect_info
))
6583 /* We have scanned the whole data. */
6584 if (! (detect_info
.rejected
& CATEGORY_MASK_ISO_7_ELSE
))
6586 /* We didn't find an 8-bit code. We may
6587 have found a null-byte, but it's very
6588 rare that a binary file conforms to
6591 coding
->head_ascii
= src
- coding
->source
;
6593 detect_info
.rejected
|= ~CATEGORY_MASK_ISO_ESCAPE
;
6597 else if (! c
&& !inhibit_nbd
)
6599 null_byte_found
= 1;
6600 if (eight_bit_found
)
6603 else if (! disable_ascii_optimization
6604 && ! inhibit_eol_conversion
)
6608 if (src
< src_end
&& src
[1] == '\n')
6610 coding
->eol_seen
|= EOL_SEEN_CRLF
;
6612 if (! eight_bit_found
)
6613 coding
->head_ascii
++;
6616 coding
->eol_seen
|= EOL_SEEN_CR
;
6620 coding
->eol_seen
|= EOL_SEEN_LF
;
6624 if (! eight_bit_found
)
6625 coding
->head_ascii
++;
6627 else if (! eight_bit_found
)
6628 coding
->head_ascii
++;
6631 if (null_byte_found
|| eight_bit_found
6632 || coding
->head_ascii
< coding
->src_bytes
6633 || detect_info
.found
)
6635 enum coding_category category
;
6636 struct coding_system
*this;
6638 if (coding
->head_ascii
== coding
->src_bytes
)
6639 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6640 for (i
= 0; i
< coding_category_raw_text
; i
++)
6642 category
= coding_priorities
[i
];
6643 this = coding_categories
+ category
;
6644 if (detect_info
.found
& (1 << category
))
6649 if (null_byte_found
)
6651 detect_info
.checked
|= ~CATEGORY_MASK_UTF_16
;
6652 detect_info
.rejected
|= ~CATEGORY_MASK_UTF_16
;
6654 else if (prefer_utf_8
6655 && detect_coding_utf_8 (coding
, &detect_info
))
6657 detect_info
.checked
|= ~CATEGORY_MASK_UTF_8
;
6658 detect_info
.rejected
|= ~CATEGORY_MASK_UTF_8
;
6660 for (i
= 0; i
< coding_category_raw_text
; i
++)
6662 category
= coding_priorities
[i
];
6663 this = coding_categories
+ category
;
6664 /* Some of this->detector (e.g. detect_coding_sjis)
6665 require this information. */
6666 coding
->id
= this->id
;
6669 /* No coding system of this category is defined. */
6670 detect_info
.rejected
|= (1 << category
);
6672 else if (category
>= coding_category_raw_text
)
6674 else if (detect_info
.checked
& (1 << category
))
6676 if (detect_info
.found
& (1 << category
))
6679 else if ((*(this->detector
)) (coding
, &detect_info
)
6680 && detect_info
.found
& (1 << category
))
6685 if (i
< coding_category_raw_text
)
6687 if (category
== coding_category_utf_8_auto
)
6689 Lisp_Object coding_systems
;
6691 coding_systems
= AREF (CODING_ID_ATTRS (this->id
),
6692 coding_attr_utf_bom
);
6693 if (CONSP (coding_systems
))
6695 if (detect_info
.found
& CATEGORY_MASK_UTF_8_SIG
)
6696 found
= XCAR (coding_systems
);
6698 found
= XCDR (coding_systems
);
6701 found
= CODING_ID_NAME (this->id
);
6703 else if (category
== coding_category_utf_16_auto
)
6705 Lisp_Object coding_systems
;
6707 coding_systems
= AREF (CODING_ID_ATTRS (this->id
),
6708 coding_attr_utf_bom
);
6709 if (CONSP (coding_systems
))
6711 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
6712 found
= XCAR (coding_systems
);
6713 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
6714 found
= XCDR (coding_systems
);
6717 found
= CODING_ID_NAME (this->id
);
6720 found
= CODING_ID_NAME (this->id
);
6722 else if (null_byte_found
)
6723 found
= Qno_conversion
;
6724 else if ((detect_info
.rejected
& CATEGORY_MASK_ANY
)
6725 == CATEGORY_MASK_ANY
)
6727 else if (detect_info
.rejected
)
6728 for (i
= 0; i
< coding_category_raw_text
; i
++)
6729 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
6731 this = coding_categories
+ coding_priorities
[i
];
6732 found
= CODING_ID_NAME (this->id
);
6737 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding
->id
)))
6738 == coding_category_utf_8_auto
)
6740 Lisp_Object coding_systems
;
6741 struct coding_detection_info detect_info
;
6744 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_bom
);
6745 detect_info
.found
= detect_info
.rejected
= 0;
6746 if (check_ascii (coding
) == coding
->src_bytes
)
6748 if (CONSP (coding_systems
))
6749 found
= XCDR (coding_systems
);
6753 if (CONSP (coding_systems
)
6754 && detect_coding_utf_8 (coding
, &detect_info
))
6756 if (detect_info
.found
& CATEGORY_MASK_UTF_8_SIG
)
6757 found
= XCAR (coding_systems
);
6759 found
= XCDR (coding_systems
);
6763 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding
->id
)))
6764 == coding_category_utf_16_auto
)
6766 Lisp_Object coding_systems
;
6767 struct coding_detection_info detect_info
;
6770 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_bom
);
6771 detect_info
.found
= detect_info
.rejected
= 0;
6772 coding
->head_ascii
= 0;
6773 if (CONSP (coding_systems
)
6774 && detect_coding_utf_16 (coding
, &detect_info
))
6776 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
6777 found
= XCAR (coding_systems
);
6778 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
6779 found
= XCDR (coding_systems
);
6785 int specified_eol
= (VECTORP (eol_type
) ? EOL_SEEN_NONE
6786 : EQ (eol_type
, Qdos
) ? EOL_SEEN_CRLF
6787 : EQ (eol_type
, Qmac
) ? EOL_SEEN_CR
6790 setup_coding_system (found
, coding
);
6791 if (specified_eol
!= EOL_SEEN_NONE
)
6792 adjust_coding_eol_type (coding
, specified_eol
);
6795 coding
->mode
= saved_mode
;
6800 decode_eol (struct coding_system
*coding
)
6802 Lisp_Object eol_type
;
6803 unsigned char *p
, *pbeg
, *pend
;
6805 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6806 if (EQ (eol_type
, Qunix
) || inhibit_eol_conversion
)
6809 if (NILP (coding
->dst_object
))
6810 pbeg
= coding
->destination
;
6812 pbeg
= BYTE_POS_ADDR (coding
->dst_pos_byte
);
6813 pend
= pbeg
+ coding
->produced
;
6815 if (VECTORP (eol_type
))
6817 int eol_seen
= EOL_SEEN_NONE
;
6819 for (p
= pbeg
; p
< pend
; p
++)
6822 eol_seen
|= EOL_SEEN_LF
;
6823 else if (*p
== '\r')
6825 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
6827 eol_seen
|= EOL_SEEN_CRLF
;
6831 eol_seen
|= EOL_SEEN_CR
;
6834 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6835 if ((eol_seen
& EOL_SEEN_CRLF
) != 0
6836 && (eol_seen
& EOL_SEEN_CR
) != 0
6837 && (eol_seen
& EOL_SEEN_LF
) == 0)
6838 eol_seen
= EOL_SEEN_CRLF
;
6839 else if (eol_seen
!= EOL_SEEN_NONE
6840 && eol_seen
!= EOL_SEEN_LF
6841 && eol_seen
!= EOL_SEEN_CRLF
6842 && eol_seen
!= EOL_SEEN_CR
)
6843 eol_seen
= EOL_SEEN_LF
;
6844 if (eol_seen
!= EOL_SEEN_NONE
)
6845 eol_type
= adjust_coding_eol_type (coding
, eol_seen
);
6848 if (EQ (eol_type
, Qmac
))
6850 for (p
= pbeg
; p
< pend
; p
++)
6854 else if (EQ (eol_type
, Qdos
))
6858 if (NILP (coding
->dst_object
))
6860 /* Start deleting '\r' from the tail to minimize the memory
6862 for (p
= pend
- 2; p
>= pbeg
; p
--)
6865 memmove (p
, p
+ 1, pend
-- - p
- 1);
6871 ptrdiff_t pos_byte
= coding
->dst_pos_byte
;
6872 ptrdiff_t pos
= coding
->dst_pos
;
6873 ptrdiff_t pos_end
= pos
+ coding
->produced_char
- 1;
6875 while (pos
< pos_end
)
6877 p
= BYTE_POS_ADDR (pos_byte
);
6878 if (*p
== '\r' && p
[1] == '\n')
6880 del_range_2 (pos
, pos_byte
, pos
+ 1, pos_byte
+ 1, 0);
6885 if (coding
->dst_multibyte
)
6886 pos_byte
+= BYTES_BY_CHAR_HEAD (*p
);
6891 coding
->produced
-= n
;
6892 coding
->produced_char
-= n
;
6897 /* Return a translation table (or list of them) from coding system
6898 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6902 get_translation_table (Lisp_Object attrs
, bool encodep
, int *max_lookup
)
6904 Lisp_Object standard
, translation_table
;
6907 if (NILP (Venable_character_translation
))
6914 translation_table
= CODING_ATTR_ENCODE_TBL (attrs
),
6915 standard
= Vstandard_translation_table_for_encode
;
6917 translation_table
= CODING_ATTR_DECODE_TBL (attrs
),
6918 standard
= Vstandard_translation_table_for_decode
;
6919 if (NILP (translation_table
))
6920 translation_table
= standard
;
6923 if (SYMBOLP (translation_table
))
6924 translation_table
= Fget (translation_table
, Qtranslation_table
);
6925 else if (CONSP (translation_table
))
6927 translation_table
= Fcopy_sequence (translation_table
);
6928 for (val
= translation_table
; CONSP (val
); val
= XCDR (val
))
6929 if (SYMBOLP (XCAR (val
)))
6930 XSETCAR (val
, Fget (XCAR (val
), Qtranslation_table
));
6932 if (CHAR_TABLE_P (standard
))
6934 if (CONSP (translation_table
))
6935 translation_table
= nconc2 (translation_table
, list1 (standard
));
6937 translation_table
= list2 (translation_table
, standard
);
6944 if (CHAR_TABLE_P (translation_table
)
6945 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table
)) > 1)
6947 val
= XCHAR_TABLE (translation_table
)->extras
[1];
6948 if (NATNUMP (val
) && *max_lookup
< XFASTINT (val
))
6949 *max_lookup
= XFASTINT (val
);
6951 else if (CONSP (translation_table
))
6955 for (tail
= translation_table
; CONSP (tail
); tail
= XCDR (tail
))
6956 if (CHAR_TABLE_P (XCAR (tail
))
6957 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail
))) > 1)
6959 Lisp_Object tailval
= XCHAR_TABLE (XCAR (tail
))->extras
[1];
6960 if (NATNUMP (tailval
) && *max_lookup
< XFASTINT (tailval
))
6961 *max_lookup
= XFASTINT (tailval
);
6965 return translation_table
;
6968 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6971 if (CHAR_TABLE_P (table)) \
6973 trans = CHAR_TABLE_REF (table, c); \
6974 if (CHARACTERP (trans)) \
6975 c = XFASTINT (trans), trans = Qnil; \
6977 else if (CONSP (table)) \
6981 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6982 if (CHAR_TABLE_P (XCAR (tail))) \
6984 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6985 if (CHARACTERP (trans)) \
6986 c = XFASTINT (trans), trans = Qnil; \
6987 else if (! NILP (trans)) \
6994 /* Return a translation of character(s) at BUF according to TRANS.
6995 TRANS is TO-CHAR or ((FROM . TO) ...) where
6996 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6997 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6998 translation is found, and Qnil if not found..
6999 If BUF is too short to lookup characters in FROM, return Qt. */
7002 get_translation (Lisp_Object trans
, int *buf
, int *buf_end
)
7005 if (INTEGERP (trans
))
7007 for (; CONSP (trans
); trans
= XCDR (trans
))
7009 Lisp_Object val
= XCAR (trans
);
7010 Lisp_Object from
= XCAR (val
);
7011 ptrdiff_t len
= ASIZE (from
);
7014 for (i
= 0; i
< len
; i
++)
7016 if (buf
+ i
== buf_end
)
7018 if (XINT (AREF (from
, i
)) != buf
[i
])
7029 produce_chars (struct coding_system
*coding
, Lisp_Object translation_table
,
7032 unsigned char *dst
= coding
->destination
+ coding
->produced
;
7033 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
7035 ptrdiff_t produced_chars
= 0;
7038 if (! coding
->chars_at_source
)
7040 /* Source characters are in coding->charbuf. */
7041 int *buf
= coding
->charbuf
;
7042 int *buf_end
= buf
+ coding
->charbuf_used
;
7044 if (EQ (coding
->src_object
, coding
->dst_object
))
7046 coding_set_source (coding
);
7047 dst_end
= ((unsigned char *) coding
->source
) + coding
->consumed
;
7050 while (buf
< buf_end
)
7057 ptrdiff_t from_nchars
= 1, to_nchars
= 1;
7058 Lisp_Object trans
= Qnil
;
7060 LOOKUP_TRANSLATION_TABLE (translation_table
, c
, trans
);
7063 trans
= get_translation (trans
, buf
, buf_end
);
7064 if (INTEGERP (trans
))
7066 else if (CONSP (trans
))
7068 from_nchars
= ASIZE (XCAR (trans
));
7069 trans
= XCDR (trans
);
7070 if (INTEGERP (trans
))
7074 to_nchars
= ASIZE (trans
);
7075 c
= XINT (AREF (trans
, 0));
7078 else if (EQ (trans
, Qt
) && ! last_block
)
7082 if ((dst_end
- dst
) / MAX_MULTIBYTE_LENGTH
< to_nchars
)
7084 if (((min (PTRDIFF_MAX
, SIZE_MAX
) - (buf_end
- buf
))
7085 / MAX_MULTIBYTE_LENGTH
)
7087 memory_full (SIZE_MAX
);
7088 dst
= alloc_destination (coding
,
7090 + MAX_MULTIBYTE_LENGTH
* to_nchars
,
7092 if (EQ (coding
->src_object
, coding
->dst_object
))
7094 coding_set_source (coding
);
7095 dst_end
= (((unsigned char *) coding
->source
)
7096 + coding
->consumed
);
7099 dst_end
= coding
->destination
+ coding
->dst_bytes
;
7102 for (i
= 0; i
< to_nchars
; i
++)
7105 c
= XINT (AREF (trans
, i
));
7106 if (coding
->dst_multibyte
7107 || ! CHAR_BYTE8_P (c
))
7108 CHAR_STRING_ADVANCE_NO_UNIFY (c
, dst
);
7110 *dst
++ = CHAR_TO_BYTE8 (c
);
7112 produced_chars
+= to_nchars
;
7116 /* This is an annotation datum. (-C) is the length. */
7119 carryover
= buf_end
- buf
;
7123 /* Source characters are at coding->source. */
7124 const unsigned char *src
= coding
->source
;
7125 const unsigned char *src_end
= src
+ coding
->consumed
;
7127 if (EQ (coding
->dst_object
, coding
->src_object
))
7128 dst_end
= (unsigned char *) src
;
7129 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
7131 if (coding
->src_multibyte
)
7133 bool multibytep
= 1;
7134 ptrdiff_t consumed_chars
= 0;
7138 const unsigned char *src_base
= src
;
7144 if (EQ (coding
->src_object
, coding
->dst_object
))
7145 dst_end
= (unsigned char *) src
;
7148 ptrdiff_t offset
= src
- coding
->source
;
7150 dst
= alloc_destination (coding
, src_end
- src
+ 1,
7152 dst_end
= coding
->destination
+ coding
->dst_bytes
;
7153 coding_set_source (coding
);
7154 src
= coding
->source
+ offset
;
7155 src_end
= coding
->source
+ coding
->consumed
;
7156 if (EQ (coding
->src_object
, coding
->dst_object
))
7157 dst_end
= (unsigned char *) src
;
7167 while (src
< src_end
)
7169 bool multibytep
= 1;
7172 if (dst
>= dst_end
- 1)
7174 if (EQ (coding
->src_object
, coding
->dst_object
))
7175 dst_end
= (unsigned char *) src
;
7176 if (dst
>= dst_end
- 1)
7178 ptrdiff_t offset
= src
- coding
->source
;
7179 ptrdiff_t more_bytes
;
7181 if (EQ (coding
->src_object
, coding
->dst_object
))
7182 more_bytes
= ((src_end
- src
) / 2) + 2;
7184 more_bytes
= src_end
- src
+ 2;
7185 dst
= alloc_destination (coding
, more_bytes
, dst
);
7186 dst_end
= coding
->destination
+ coding
->dst_bytes
;
7187 coding_set_source (coding
);
7188 src
= coding
->source
+ offset
;
7189 src_end
= coding
->source
+ coding
->consumed
;
7190 if (EQ (coding
->src_object
, coding
->dst_object
))
7191 dst_end
= (unsigned char *) src
;
7199 if (!EQ (coding
->src_object
, coding
->dst_object
))
7201 ptrdiff_t require
= coding
->src_bytes
- coding
->dst_bytes
;
7205 ptrdiff_t offset
= src
- coding
->source
;
7207 dst
= alloc_destination (coding
, require
, dst
);
7208 coding_set_source (coding
);
7209 src
= coding
->source
+ offset
;
7210 src_end
= coding
->source
+ coding
->consumed
;
7213 produced_chars
= coding
->consumed_char
;
7214 while (src
< src_end
)
7219 produced
= dst
- (coding
->destination
+ coding
->produced
);
7220 if (BUFFERP (coding
->dst_object
) && produced_chars
> 0)
7221 insert_from_gap (produced_chars
, produced
, 0);
7222 coding
->produced
+= produced
;
7223 coding
->produced_char
+= produced_chars
;
7227 /* Compose text in CODING->object according to the annotation data at
7228 CHARBUF. CHARBUF is an array:
7229 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7233 produce_composition (struct coding_system
*coding
, int *charbuf
, ptrdiff_t pos
)
7237 enum composition_method method
;
7238 Lisp_Object components
;
7240 len
= -charbuf
[0] - MAX_ANNOTATION_LENGTH
;
7241 to
= pos
+ charbuf
[2];
7242 method
= (enum composition_method
) (charbuf
[4]);
7244 if (method
== COMPOSITION_RELATIVE
)
7248 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
7251 if (method
== COMPOSITION_WITH_RULE
)
7252 len
= charbuf
[2] * 3 - 2;
7253 charbuf
+= MAX_ANNOTATION_LENGTH
;
7254 /* charbuf = [ CHAR ... CHAR ] or [ CHAR -2 RULE ... CHAR ] */
7255 for (i
= j
= 0; i
< len
&& charbuf
[i
] != -1; i
++, j
++)
7257 if (charbuf
[i
] >= 0)
7258 args
[j
] = make_number (charbuf
[i
]);
7262 args
[j
] = make_number (charbuf
[i
] % 0x100);
7265 components
= (i
== j
? Fstring (j
, args
) : Fvector (j
, args
));
7267 compose_text (pos
, to
, components
, Qnil
, coding
->dst_object
);
7271 /* Put `charset' property on text in CODING->object according to
7272 the annotation data at CHARBUF. CHARBUF is an array:
7273 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7277 produce_charset (struct coding_system
*coding
, int *charbuf
, ptrdiff_t pos
)
7279 ptrdiff_t from
= pos
- charbuf
[2];
7280 struct charset
*charset
= CHARSET_FROM_ID (charbuf
[3]);
7282 Fput_text_property (make_number (from
), make_number (pos
),
7283 Qcharset
, CHARSET_NAME (charset
),
7284 coding
->dst_object
);
7288 #define CHARBUF_SIZE 0x4000
7290 #define ALLOC_CONVERSION_WORK_AREA(coding) \
7292 coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int)); \
7293 coding->charbuf_size = CHARBUF_SIZE; \
7298 produce_annotation (struct coding_system
*coding
, ptrdiff_t pos
)
7300 int *charbuf
= coding
->charbuf
;
7301 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
7303 if (NILP (coding
->dst_object
))
7306 while (charbuf
< charbuf_end
)
7312 int len
= -*charbuf
;
7317 case CODING_ANNOTATE_COMPOSITION_MASK
:
7318 produce_composition (coding
, charbuf
, pos
);
7320 case CODING_ANNOTATE_CHARSET_MASK
:
7321 produce_charset (coding
, charbuf
, pos
);
7329 /* Decode the data at CODING->src_object into CODING->dst_object.
7330 CODING->src_object is a buffer, a string, or nil.
7331 CODING->dst_object is a buffer.
7333 If CODING->src_object is a buffer, it must be the current buffer.
7334 In this case, if CODING->src_pos is positive, it is a position of
7335 the source text in the buffer, otherwise, the source text is in the
7336 gap area of the buffer, and CODING->src_pos specifies the offset of
7337 the text from GPT (which must be the same as PT). If this is the
7338 same buffer as CODING->dst_object, CODING->src_pos must be
7341 If CODING->src_object is a string, CODING->src_pos is an index to
7344 If CODING->src_object is nil, CODING->source must already point to
7345 the non-relocatable memory area. In this case, CODING->src_pos is
7346 an offset from CODING->source.
7348 The decoded data is inserted at the current point of the buffer
7353 decode_coding (struct coding_system
*coding
)
7356 Lisp_Object undo_list
;
7357 Lisp_Object translation_table
;
7358 struct ccl_spec cclspec
;
7364 if (BUFFERP (coding
->src_object
)
7365 && coding
->src_pos
> 0
7366 && coding
->src_pos
< GPT
7367 && coding
->src_pos
+ coding
->src_chars
> GPT
)
7368 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
7371 if (BUFFERP (coding
->dst_object
))
7373 set_buffer_internal (XBUFFER (coding
->dst_object
));
7375 move_gap_both (PT
, PT_BYTE
);
7377 /* We must disable undo_list in order to record the whole insert
7378 transaction via record_insert at the end. But doing so also
7379 disables the recording of the first change to the undo_list.
7380 Therefore we check for first change here and record it via
7381 record_first_change if needed. */
7382 if (MODIFF
<= SAVE_MODIFF
)
7383 record_first_change ();
7385 undo_list
= BVAR (current_buffer
, undo_list
);
7386 bset_undo_list (current_buffer
, Qt
);
7389 coding
->consumed
= coding
->consumed_char
= 0;
7390 coding
->produced
= coding
->produced_char
= 0;
7391 coding
->chars_at_source
= 0;
7392 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
7395 ALLOC_CONVERSION_WORK_AREA (coding
);
7397 attrs
= CODING_ID_ATTRS (coding
->id
);
7398 translation_table
= get_translation_table (attrs
, 0, NULL
);
7401 if (coding
->decoder
== decode_coding_ccl
)
7403 coding
->spec
.ccl
= &cclspec
;
7404 setup_ccl_program (&cclspec
.ccl
, CODING_CCL_DECODER (coding
));
7408 ptrdiff_t pos
= coding
->dst_pos
+ coding
->produced_char
;
7410 coding_set_source (coding
);
7411 coding
->annotated
= 0;
7412 coding
->charbuf_used
= carryover
;
7413 (*(coding
->decoder
)) (coding
);
7414 coding_set_destination (coding
);
7415 carryover
= produce_chars (coding
, translation_table
, 0);
7416 if (coding
->annotated
)
7417 produce_annotation (coding
, pos
);
7418 for (i
= 0; i
< carryover
; i
++)
7420 = coding
->charbuf
[coding
->charbuf_used
- carryover
+ i
];
7422 while (coding
->result
== CODING_RESULT_INSUFFICIENT_DST
7423 || (coding
->consumed
< coding
->src_bytes
7424 && (coding
->result
== CODING_RESULT_SUCCESS
7425 || coding
->result
== CODING_RESULT_INVALID_SRC
)));
7429 coding_set_destination (coding
);
7430 coding
->charbuf_used
= carryover
;
7431 produce_chars (coding
, translation_table
, 1);
7434 coding
->carryover_bytes
= 0;
7435 if (coding
->consumed
< coding
->src_bytes
)
7437 ptrdiff_t nbytes
= coding
->src_bytes
- coding
->consumed
;
7438 const unsigned char *src
;
7440 coding_set_source (coding
);
7441 coding_set_destination (coding
);
7442 src
= coding
->source
+ coding
->consumed
;
7444 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
7446 /* Flush out unprocessed data as binary chars. We are sure
7447 that the number of data is less than the size of
7449 coding
->charbuf_used
= 0;
7450 coding
->chars_at_source
= 0;
7452 while (nbytes
-- > 0)
7457 c
= BYTE8_TO_CHAR (c
);
7458 coding
->charbuf
[coding
->charbuf_used
++] = c
;
7460 produce_chars (coding
, Qnil
, 1);
7464 /* Record unprocessed bytes in coding->carryover. We are
7465 sure that the number of data is less than the size of
7466 coding->carryover. */
7467 unsigned char *p
= coding
->carryover
;
7469 if (nbytes
> sizeof coding
->carryover
)
7470 nbytes
= sizeof coding
->carryover
;
7471 coding
->carryover_bytes
= nbytes
;
7472 while (nbytes
-- > 0)
7475 coding
->consumed
= coding
->src_bytes
;
7478 if (! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
)
7479 && !inhibit_eol_conversion
)
7480 decode_eol (coding
);
7481 if (BUFFERP (coding
->dst_object
))
7483 bset_undo_list (current_buffer
, undo_list
);
7484 record_insert (coding
->dst_pos
, coding
->produced_char
);
7491 /* Extract an annotation datum from a composition starting at POS and
7492 ending before LIMIT of CODING->src_object (buffer or string), store
7493 the data in BUF, set *STOP to a starting position of the next
7494 composition (if any) or to LIMIT, and return the address of the
7495 next element of BUF.
7497 If such an annotation is not found, set *STOP to a starting
7498 position of a composition after POS (if any) or to LIMIT, and
7502 handle_composition_annotation (ptrdiff_t pos
, ptrdiff_t limit
,
7503 struct coding_system
*coding
, int *buf
,
7506 ptrdiff_t start
, end
;
7509 if (! find_composition (pos
, limit
, &start
, &end
, &prop
, coding
->src_object
)
7512 else if (start
> pos
)
7518 /* We found a composition. Store the corresponding
7519 annotation data in BUF. */
7521 enum composition_method method
= composition_method (prop
);
7522 int nchars
= COMPOSITION_LENGTH (prop
);
7524 ADD_COMPOSITION_DATA (buf
, nchars
, 0, method
);
7525 if (method
!= COMPOSITION_RELATIVE
)
7527 Lisp_Object components
;
7528 ptrdiff_t i
, len
, i_byte
;
7530 components
= COMPOSITION_COMPONENTS (prop
);
7531 if (VECTORP (components
))
7533 len
= ASIZE (components
);
7534 for (i
= 0; i
< len
; i
++)
7535 *buf
++ = XINT (AREF (components
, i
));
7537 else if (STRINGP (components
))
7539 len
= SCHARS (components
);
7543 FETCH_STRING_CHAR_ADVANCE (*buf
, components
, i
, i_byte
);
7547 else if (INTEGERP (components
))
7550 *buf
++ = XINT (components
);
7552 else if (CONSP (components
))
7554 for (len
= 0; CONSP (components
);
7555 len
++, components
= XCDR (components
))
7556 *buf
++ = XINT (XCAR (components
));
7564 if (find_composition (end
, limit
, &start
, &end
, &prop
,
7575 /* Extract an annotation datum from a text property `charset' at POS of
7576 CODING->src_object (buffer or string), store the data in BUF, set
7577 *STOP to the position where the value of `charset' property changes
7578 (limiting by LIMIT), and return the address of the next element of
7581 If the property value is nil, set *STOP to the position where the
7582 property value is non-nil (limiting by LIMIT), and return BUF. */
7585 handle_charset_annotation (ptrdiff_t pos
, ptrdiff_t limit
,
7586 struct coding_system
*coding
, int *buf
,
7589 Lisp_Object val
, next
;
7592 val
= Fget_text_property (make_number (pos
), Qcharset
, coding
->src_object
);
7593 if (! NILP (val
) && CHARSETP (val
))
7594 id
= XINT (CHARSET_SYMBOL_ID (val
));
7597 ADD_CHARSET_DATA (buf
, 0, id
);
7598 next
= Fnext_single_property_change (make_number (pos
), Qcharset
,
7600 make_number (limit
));
7601 *stop
= XINT (next
);
/* Read characters from CODING->source, beginning after the
   CODING->consumed bytes already handled, and store them in
   CODING->charbuf for the encoder.  When TRANSLATION_TABLE is non-nil
   each character is looked up in it, using a scratch array of
   max_lookup ints for look-ahead.  On exit CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used describe the
   progress made, and CODING->chars_at_source is zero.  */
7607 consume_chars (struct coding_system
*coding
, Lisp_Object translation_table
,
7610 int *buf
= coding
->charbuf
;
7611 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
;
7612 const unsigned char *src
= coding
->source
+ coding
->consumed
;
7613 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
7614 ptrdiff_t pos
= coding
->src_pos
+ coding
->consumed_char
;
7615 ptrdiff_t end_pos
= coding
->src_pos
+ coding
->src_chars
;
7616 bool multibytep
= coding
->src_multibyte
;
7617 Lisp_Object eol_type
;
7619 ptrdiff_t stop
, stop_composition
, stop_charset
;
7620 int *lookup_buf
= NULL
;
/* The look-ahead scratch array is needed only when translating.  */
7622 if (! NILP (translation_table
))
7623 lookup_buf
= alloca (sizeof (int) * max_lookup
);
/* inhibit_eol_conversion forces the Unix end-of-line type.  */
7625 eol_type
= inhibit_eol_conversion
? Qunix
: CODING_ID_EOL_TYPE (coding
->id
);
7626 if (VECTORP (eol_type
))
7629 /* Note: composition handling is not yet implemented. */
7630 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
/* Choose the first positions at which annotations must be examined.
   Without a source object every stop is END_POS.  */
7632 if (NILP (coding
->src_object
))
7633 stop
= stop_composition
= stop_charset
= end_pos
;
7636 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
)
7637 stop
= stop_composition
= pos
;
7639 stop
= stop_composition
= end_pos
;
7640 if (coding
->common_flags
& CODING_ANNOTATE_CHARSET_MASK
)
7641 stop
= stop_charset
= pos
;
7643 stop_charset
= end_pos
;
7646 /* Compensate for CRLF and conversion: keep room at the end of the
   buffer for one extra element plus a complete annotation.  */
7647 buf_end
-= 1 + MAX_ANNOTATION_LENGTH
;
7648 while (buf
< buf_end
)
/* Emit the annotations that begin at POS, then recompute the nearest
   stop position.  */
7656 if (pos
== stop_composition
)
7657 buf
= handle_composition_annotation (pos
, end_pos
, coding
,
7658 buf
, &stop_composition
);
7659 if (pos
== stop_charset
)
7660 buf
= handle_charset_annotation (pos
, end_pos
, coding
,
7661 buf
, &stop_charset
);
7662 stop
= (stop_composition
< stop_charset
7663 ? stop_composition
: stop_charset
);
/* NOTE(review): the statement executed for the raw-text and CCL
   encoders (original line 7672) is absent from this listing.  */
7670 if (coding
->encoder
== encode_coding_raw_text
7671 || coding
->encoder
== encode_coding_ccl
)
/* A valid multibyte sequence yields one character and advances POS by
   its byte length; line 7676 instead maps a single byte with
   BYTE8_TO_CHAR.  */
7673 else if ((bytes
= MULTIBYTE_LENGTH (src
, src_end
)) > 0)
7674 c
= STRING_CHAR_ADVANCE_NO_UNIFY (src
), pos
+= bytes
;
7676 c
= BYTE8_TO_CHAR (*src
), src
++, pos
++;
7679 c
= STRING_CHAR_ADVANCE_NO_UNIFY (src
), pos
++;
7680 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
7682 if (! EQ (eol_type
, Qunix
))
7686 if (EQ (eol_type
, Qdos
))
/* Look C up in TRANSLATION_TABLE; the result is left in TRANS.  */
7694 LOOKUP_TRANSLATION_TABLE (translation_table
, c
, trans
);
7699 ptrdiff_t from_nchars
= 1, to_nchars
= 1;
7700 int *lookup_buf_end
;
7701 const unsigned char *p
= src
;
/* Fill LOOKUP_BUF from index 1 with look-ahead characters, up to
   max_lookup entries or the end of the source.  */
7705 for (i
= 1; i
< max_lookup
&& p
< src_end
; i
++)
7706 lookup_buf
[i
] = STRING_CHAR_ADVANCE (p
);
7707 lookup_buf_end
= lookup_buf
+ i
;
7708 trans
= get_translation (trans
, lookup_buf
, lookup_buf_end
);
7709 if (INTEGERP (trans
))
7711 else if (CONSP (trans
))
/* A cons translation: the size of its car is the number of source
   characters consumed, and its cdr is the replacement.  */
7713 from_nchars
= ASIZE (XCAR (trans
));
7714 trans
= XCDR (trans
);
7715 if (INTEGERP (trans
))
7719 to_nchars
= ASIZE (trans
);
/* Does the replacement fit in what is left of the buffer?  */
7720 if (buf_end
- buf
< to_nchars
)
7722 c
= XINT (AREF (trans
, 0));
/* Store the remaining characters of the replacement.  */
7728 for (i
= 1; i
< to_nchars
; i
++)
7729 *buf
++ = XINT (AREF (trans
, i
));
/* Skip the other source characters covered by the translation.  */
7730 for (i
= 1; i
< from_nchars
; i
++, pos
++)
7731 src
+= MULTIBYTE_LENGTH_NO_CHECK (src
);
/* Report the progress made back through CODING.  */
7735 coding
->consumed
= src
- coding
->source
;
7736 coding
->consumed_char
= pos
- coding
->src_pos
;
7737 coding
->charbuf_used
= buf
- coding
->charbuf
;
7738 coding
->chars_at_source
= 0;
7742 /* Encode the text at CODING->src_object into CODING->dst_object.
   CODING->src_object is a buffer or a string.
   CODING->dst_object is a buffer or nil.

   If CODING->src_object is a buffer, it must be the current buffer.
   In this case, if CODING->src_pos is positive, it is a position of
   the source text in the buffer; otherwise the source text is in the
   gap area of the buffer, and CODING->src_pos specifies the offset of
   the text from GPT (which must be the same as PT).  If this is the
   same buffer as CODING->dst_object, CODING->src_pos must be
   negative and CODING should not have `pre-write-conversion'.

   If CODING->src_object is a string, CODING should not have
   `pre-write-conversion'.

   If CODING->dst_object is a buffer, the encoded data is inserted at
   the current point of that buffer.

   If CODING->dst_object is nil, the encoded data is placed at the
   memory area specified by CODING->destination.  */
7764 encode_coding (struct coding_system
*coding
)
7767 Lisp_Object translation_table
;
7769 struct ccl_spec cclspec
;
7773 attrs
= CODING_ID_ATTRS (coding
->id
);
/* The raw-text encoder runs without a translation table.  */
7774 if (coding
->encoder
== encode_coding_raw_text
)
7775 translation_table
= Qnil
, max_lookup
= 0;
7777 translation_table
= get_translation_table (attrs
, 1, &max_lookup
);
/* A buffer destination is made current and supplies the
   multibyteness of the output.  */
7779 if (BUFFERP (coding
->dst_object
))
7781 set_buffer_internal (XBUFFER (coding
->dst_object
));
7782 coding
->dst_multibyte
7783 = ! NILP (BVAR (current_buffer
, enable_multibyte_characters
));
/* Reset the progress counters and the result code.  */
7786 coding
->consumed
= coding
->consumed_char
= 0;
7787 coding
->produced
= coding
->produced_char
= 0;
7788 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
7791 ALLOC_CONVERSION_WORK_AREA (coding
);
/* The CCL encoder keeps its program state in the local CCLSPEC.  */
7793 if (coding
->encoder
== encode_coding_ccl
)
7795 coding
->spec
.ccl
= &cclspec
;
7796 setup_ccl_program (&cclspec
.ccl
, CODING_CCL_ENCODER (coding
));
/* Alternate between filling the character buffer and running the
   encoder until every source character has been consumed.  */
7799 coding_set_source (coding
);
7800 consume_chars (coding
, translation_table
, max_lookup
);
7801 coding_set_destination (coding
);
7802 (*(coding
->encoder
)) (coding
);
7803 } while (coding
->consumed_char
< coding
->src_chars
);
/* With a buffer destination, insert whatever was produced from the
   gap.  */
7805 if (BUFFERP (coding
->dst_object
) && coding
->produced_char
> 0)
7806 insert_from_gap (coding
->produced_char
, coding
->produced
, 0);
7812 /* Name (or base name) of the work buffer for code conversion.  */
7813 static Lisp_Object Vcode_conversion_workbuf_name
;
7815 /* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  Its name is the value of
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after their use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
7820 static Lisp_Object Vcode_conversion_reused_workbuf
;
7822 /* True if and only if Vcode_conversion_reused_workbuf is already in
   use.  */
7823 static bool reused_workbuf_in_use
;
7826 /* Return a working buffer for code conversion.  MULTIBYTE specifies
   the multibyteness of the returned buffer.  */
7830 make_conversion_work_buffer (bool multibyte
)
7832 Lisp_Object name
, workbuf
;
7833 struct buffer
*current
;
/* The shared work buffer is busy, so create a buffer under a freshly
   generated name.  */
7835 if (reused_workbuf_in_use
)
7837 name
= Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name
, Qnil
);
7838 workbuf
= Fget_buffer_create (name
);
/* Claim the shared work buffer, creating it again if it is no longer
   live.  */
7842 reused_workbuf_in_use
= 1;
7843 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf
)))
7844 Vcode_conversion_reused_workbuf
7845 = Fget_buffer_create (Vcode_conversion_workbuf_name
);
7846 workbuf
= Vcode_conversion_reused_workbuf
;
/* Configure the work buffer while it is temporarily current.  */
7848 current
= current_buffer
;
7849 set_buffer_internal (XBUFFER (workbuf
));
7850 /* We can't allow modification hooks to run in the work buffer.  For
   instance, directory_files_internal assumes that file decoding
   doesn't compile new regexps.  */
7853 Fset (Fmake_local_variable (Qinhibit_modification_hooks
), Qt
);
/* Set the undo list to t and apply the requested multibyteness.  */
7855 bset_undo_list (current_buffer
, Qt
);
7856 bset_enable_multibyte_characters (current_buffer
, multibyte
? Qt
: Qnil
);
/* Switch back to the buffer that was current on entry.  */
7857 set_buffer_internal (current
);
/* Undo the effect of code_conversion_save.  ARG is a cons whose car is
   the buffer to make current again and whose cdr is a work buffer or
   nil (code_conversion_save registers this function with the then
   current buffer and its work buffer).  */
7863 code_conversion_restore (Lisp_Object arg
)
7865 Lisp_Object current
, workbuf
;
7866 struct gcpro gcpro1
;
7869 current
= XCAR (arg
);
7870 workbuf
= XCDR (arg
);
7871 if (! NILP (workbuf
))
/* Releasing the shared work buffer only clears its in-use flag.  */
7873 if (EQ (workbuf
, Vcode_conversion_reused_workbuf
))
7874 reused_workbuf_in_use
= 0;
/* NOTE(review): original line 7875 is absent from this listing;
   presumably this kill applies only to the other work buffers --
   confirm.  */
7876 Fkill_buffer (workbuf
);
/* Make the saved buffer current again.  */
7878 set_buffer_internal (XBUFFER (current
));
/* Register code_conversion_restore as an unwind-protect handler,
   passing it the current buffer paired with WORKBUF.  WORKBUF starts
   as nil and can be replaced by a buffer obtained from
   make_conversion_work_buffer (MULTIBYTE).
   NOTE(review): the test of WITH_WORK_BUF and the return statement
   are on lines absent from this listing; callers in this file use the
   return value as the work buffer -- confirm.  */
7883 code_conversion_save (bool with_work_buf
, bool multibyte
)
7885 Lisp_Object workbuf
= Qnil
;
7888 workbuf
= make_conversion_work_buffer (multibyte
);
7889 record_unwind_protect (code_conversion_restore
,
7890 Fcons (Fcurrent_buffer (), workbuf
));
/* Decode, by CODING, CHARS characters occupying BYTES bytes, with the
   current buffer as both source and destination.  The source position
   is recorded as -CHARS/-BYTES and the destination position as point.
   The end-of-line fast paths below take the source to be the
   CODING->src_bytes bytes that end at GAP_END_ADDR.  */
7895 decode_coding_gap (struct coding_system
*coding
,
7896 ptrdiff_t chars
, ptrdiff_t bytes
)
7898 ptrdiff_t count
= SPECPDL_INDEX ();
/* Source: the current buffer.  */
7901 coding
->src_object
= Fcurrent_buffer ();
7902 coding
->src_chars
= chars
;
7903 coding
->src_bytes
= bytes
;
7904 coding
->src_pos
= -chars
;
7905 coding
->src_pos_byte
= -bytes
;
/* More bytes than characters means the source holds multibyte
   sequences.  */
7906 coding
->src_multibyte
= chars
< bytes
;
/* Destination: the same buffer, at point.  */
7907 coding
->dst_object
= coding
->src_object
;
7908 coding
->dst_pos
= PT
;
7909 coding
->dst_pos_byte
= PT_BYTE
;
7910 coding
->dst_multibyte
= ! NILP (BVAR (current_buffer
, enable_multibyte_characters
));
/* Negative values mean that no scan result is recorded yet (see the
   `>= 0' test on detected_utf8_chars below).  */
7912 coding
->head_ascii
= -1;
7913 coding
->detected_utf8_bytes
= coding
->detected_utf8_chars
= -1;
7914 coding
->eol_seen
= EOL_SEEN_NONE
;
/* Run detection first when the coding system requires it.  */
7915 if (CODING_REQUIRE_DETECTION (coding
))
7916 detect_coding (coding
);
7917 attrs
= CODING_ID_ATTRS (coding
->id
);
/* Fast path, tried only when ASCII optimization is enabled, the source
   is unibyte, and the coding system is ASCII compatible with neither a
   post-read function nor a translation table.  */
7918 if (! disable_ascii_optimization
7919 && ! coding
->src_multibyte
7920 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
))
7921 && NILP (CODING_ATTR_POST_READ (attrs
))
7922 && NILP (get_translation_table (attrs
, 0, NULL
)))
7924 chars
= coding
->head_ascii
;
7926 chars
= check_ascii (coding
);
7929 /* There exists a non-ASCII byte. */
7930 if (EQ (CODING_ATTR_TYPE (attrs
), Qutf_8
)
7931 && coding
->detected_utf8_bytes
== coding
->src_bytes
)
/* Reuse the character count from detection when one was recorded.  */
7933 if (coding
->detected_utf8_chars
>= 0)
7934 chars
= coding
->detected_utf8_chars
;
7936 chars
= check_utf_8 (coding
);
/* Test for a UTF-8 byte order mark at the very start of the source
   when CODING is not declared BOM-less; line 7945 then takes its three
   bytes off the byte count.  */
7937 if (CODING_UTF_8_BOM (coding
) != utf_without_bom
7938 && coding
->head_ascii
== 0
7939 && coding
->source
[0] == UTF_8_BOM_1
7940 && coding
->source
[1] == UTF_8_BOM_2
7941 && coding
->source
[2] == UTF_8_BOM_3
)
7945 coding
->src_bytes
-= 3;
7953 Lisp_Object eol_type
;
/* End-of-line handling: an undecided (vector) type is settled from
   what the scan saw, if it saw anything.  */
7955 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
7956 if (VECTORP (eol_type
))
7958 if (coding
->eol_seen
!= EOL_SEEN_NONE
)
7959 eol_type
= adjust_coding_eol_type (coding
, coding
->eol_seen
);
/* Mac end-of-line: walk the source bytes, which end at
   GAP_END_ADDR.  */
7961 if (EQ (eol_type
, Qmac
))
7963 unsigned char *src_end
= GAP_END_ADDR
;
7964 unsigned char *src
= src_end
- coding
->src_bytes
;
7966 while (src
< src_end
)
/* DOS end-of-line: SRC and DST both start at GAP_END_ADDR and the
   loop runs while SRC is above SRC_BEG; a LF preceded by CR is
   detected below.  */
7972 else if (EQ (eol_type
, Qdos
))
7974 unsigned char *src
= GAP_END_ADDR
;
7975 unsigned char *src_beg
= src
- coding
->src_bytes
;
7976 unsigned char *dst
= src
;
7979 while (src_beg
< src
)
7982 if (*src
== '\n' && src
> src_beg
&& src
[-1] == '\r')
/* Record what was produced and insert it from the gap.  */
7989 coding
->produced
= bytes
;
7990 coding
->produced_char
= chars
;
7991 insert_from_gap (chars
, bytes
, 1);
/* General path: register the unwind handler (no work buffer) and run
   the decoder on the text as the last block.  */
7995 code_conversion_save (0, 0);
7997 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
/* inhibit_shrinking is set only while decode_coding runs.  */
7998 current_buffer
->text
->inhibit_shrinking
= 1;
7999 decode_coding (coding
);
8000 current_buffer
->text
->inhibit_shrinking
= 0;
/* Call the post-read function with point at CODING->dst_pos, then add
   any change in buffer size to the produced counts.  */
8002 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
8004 ptrdiff_t prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
8007 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
8008 val
= call1 (CODING_ATTR_POST_READ (attrs
),
8009 make_number (coding
->produced_char
));
8011 coding
->produced_char
+= Z
- prev_Z
;
8012 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
/* Run the unwind handlers registered since COUNT was taken.  */
8015 unbind_to (count
, Qnil
);
8019 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer, a string, or Qnil.

   If it is a buffer, the text is at point of the buffer.  FROM and TO
   are positions in the buffer.

   If it is a string, the text is at the beginning of the string.
   FROM and TO are indices to the string.

   If it is nil, the text is at CODING->source.  FROM and TO are
   indices to CODING->source.

   DST_OBJECT is a buffer, Qt, or Qnil.

   If it is a buffer, the decoded text is inserted at point of the
   buffer.  If the buffer is the same as SRC_OBJECT, the source text
   is deleted first (see the del_range_both call below).

   If it is Qt, a string is made from the decoded text, and
   set in CODING->dst_object.

   If it is Qnil, the decoded text is stored at CODING->destination.
   The caller must allocate CODING->dst_bytes bytes at
   CODING->destination by xmalloc.  If the decoded text is longer than
   CODING->dst_bytes, CODING->destination is relocated by xrealloc.

   NOTE(review): the line that closes this comment (original line
   8046) is absent from this listing and must be restored by hand.
8049 decode_coding_object (struct coding_system
*coding
,
8050 Lisp_Object src_object
,
8051 ptrdiff_t from
, ptrdiff_t from_byte
,
8052 ptrdiff_t to
, ptrdiff_t to_byte
,
8053 Lisp_Object dst_object
)
8055 ptrdiff_t count
= SPECPDL_INDEX ();
8056 unsigned char *destination
IF_LINT (= NULL
);
8057 ptrdiff_t dst_bytes
IF_LINT (= 0);
8058 ptrdiff_t chars
= to
- from
;
8059 ptrdiff_t bytes
= to_byte
- from_byte
;
8061 ptrdiff_t saved_pt
= -1, saved_pt_byte
IF_LINT (= 0);
8062 bool need_marker_adjustment
= 0;
8063 Lisp_Object old_deactivate_mark
;
8065 old_deactivate_mark
= Vdeactivate_mark
;
8067 if (NILP (dst_object
))
8069 destination
= coding
->destination
;
8070 dst_bytes
= coding
->dst_bytes
;
8073 coding
->src_object
= src_object
;
8074 coding
->src_chars
= chars
;
8075 coding
->src_bytes
= bytes
;
8076 coding
->src_multibyte
= chars
< bytes
;
8078 if (STRINGP (src_object
))
8080 coding
->src_pos
= from
;
8081 coding
->src_pos_byte
= from_byte
;
8083 else if (BUFFERP (src_object
))
8085 set_buffer_internal (XBUFFER (src_object
));
8087 move_gap_both (from
, from_byte
);
8088 if (EQ (src_object
, dst_object
))
8090 struct Lisp_Marker
*tail
;
8092 for (tail
= BUF_MARKERS (current_buffer
); tail
; tail
= tail
->next
)
8094 tail
->need_adjustment
8095 = tail
->charpos
== (tail
->insertion_type
? from
: to
);
8096 need_marker_adjustment
|= tail
->need_adjustment
;
8098 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
8099 TEMP_SET_PT_BOTH (from
, from_byte
);
8100 current_buffer
->text
->inhibit_shrinking
= 1;
8101 del_range_both (from
, from_byte
, to
, to_byte
, 1);
8102 coding
->src_pos
= -chars
;
8103 coding
->src_pos_byte
= -bytes
;
8107 coding
->src_pos
= from
;
8108 coding
->src_pos_byte
= from_byte
;
8112 if (CODING_REQUIRE_DETECTION (coding
))
8113 detect_coding (coding
);
8114 attrs
= CODING_ID_ATTRS (coding
->id
);
8116 if (EQ (dst_object
, Qt
)
8117 || (! NILP (CODING_ATTR_POST_READ (attrs
))
8118 && NILP (dst_object
)))
8120 coding
->dst_multibyte
= !CODING_FOR_UNIBYTE (coding
);
8121 coding
->dst_object
= code_conversion_save (1, coding
->dst_multibyte
);
8122 coding
->dst_pos
= BEG
;
8123 coding
->dst_pos_byte
= BEG_BYTE
;
8125 else if (BUFFERP (dst_object
))
8127 code_conversion_save (0, 0);
8128 coding
->dst_object
= dst_object
;
8129 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
8130 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
8131 coding
->dst_multibyte
8132 = ! NILP (BVAR (XBUFFER (dst_object
), enable_multibyte_characters
));
8136 code_conversion_save (0, 0);
8137 coding
->dst_object
= Qnil
;
8138 /* Most callers presume this will return a multibyte result, and they
8139 won't use `binary' or `raw-text' anyway, so let's not worry about
8140 CODING_FOR_UNIBYTE. */
8141 coding
->dst_multibyte
= 1;
/* Run the decoder, then continue in the destination buffer if there
   is one.  */
8144 decode_coding (coding
);
8146 if (BUFFERP (coding
->dst_object
))
8147 set_buffer_internal (XBUFFER (coding
->dst_object
));
/* Call the post-read function with point at CODING->dst_pos, keeping
   the objects involved protected from GC, then add any change in
   buffer size to the produced counts.  */
8149 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
8151 struct gcpro gcpro1
, gcpro2
, gcpro3
, gcpro4
, gcpro5
;
8152 ptrdiff_t prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
8155 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
8156 GCPRO5 (coding
->src_object
, coding
->dst_object
, src_object
, dst_object
,
8157 old_deactivate_mark
);
8158 val
= safe_call1 (CODING_ATTR_POST_READ (attrs
),
8159 make_number (coding
->produced_char
));
8162 coding
->produced_char
+= Z
- prev_Z
;
8163 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
/* DST_OBJECT of Qt: hand the result back as a string.  */
8166 if (EQ (dst_object
, Qt
))
8168 coding
->dst_object
= Fbuffer_string ();
/* DST_OBJECT of nil although decoding went through a buffer: copy the
   produced bytes, starting at BEGV_ADDR, out to DESTINATION.  The
   memory is grown with xrealloc when it is smaller than the result,
   and the gap is moved to BEGV first if it lies inside the produced
   text.  */
8170 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
8172 set_buffer_internal (XBUFFER (coding
->dst_object
));
8173 if (dst_bytes
< coding
->produced
)
8175 eassert (coding
->produced
> 0);
8176 destination
= xrealloc (destination
, coding
->produced
);
8177 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
8178 move_gap_both (BEGV
, BEGV_BYTE
);
8179 memcpy (destination
, BEGV_ADDR
, coding
->produced
);
8180 coding
->destination
= destination
;
8186 /* This is the case of:
   (BUFFERP (src_object) && EQ (src_object, dst_object))
   As we have moved PT while replacing the original buffer
   contents, we must recover it now.  */
8190 set_buffer_internal (XBUFFER (src_object
));
8191 current_buffer
->text
->inhibit_shrinking
= 0;
/* Point before the range is kept, point inside it goes to FROM, and
   point after it is shifted by the change in size.  */
8192 if (saved_pt
< from
)
8193 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
8194 else if (saved_pt
< from
+ chars
)
8195 TEMP_SET_PT_BOTH (from
, from_byte
);
8196 else if (! NILP (BVAR (current_buffer
, enable_multibyte_characters
)))
8197 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
8198 saved_pt_byte
+ (coding
->produced
- bytes
));
/* Here both the character and the byte position move by the byte
   difference.  */
8200 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
8201 saved_pt_byte
+ (coding
->produced
- bytes
));
/* Re-seat the markers that were flagged on the boundaries of the
   replaced range.  */
8203 if (need_marker_adjustment
)
8205 struct Lisp_Marker
*tail
;
8207 for (tail
= BUF_MARKERS (current_buffer
); tail
; tail
= tail
->next
)
8208 if (tail
->need_adjustment
)
8210 tail
->need_adjustment
= 0;
8211 if (tail
->insertion_type
)
8213 tail
->bytepos
= from_byte
;
8214 tail
->charpos
= from
;
8218 tail
->bytepos
= from_byte
+ coding
->produced
;
8220 = (NILP (BVAR (current_buffer
, enable_multibyte_characters
))
8221 ? tail
->bytepos
: from
+ coding
->produced_char
);
/* Restore Vdeactivate_mark to the value saved on entry and run the
   unwind handlers.  */
8227 Vdeactivate_mark
= old_deactivate_mark
;
8228 unbind_to (count
, coding
->dst_object
);
/* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE of
   SRC_OBJECT into DST_OBJECT by CODING.

   SRC_OBJECT may be a string or a buffer; for any other value the
   pre-write path below reads the text from CODING->source.

   If DST_OBJECT is a buffer, CODING->dst_object is set to it.  If it
   is Qt, the bytes are first collected in memory obtained with xmalloc
   and, unless CODING->raw_destination is set, turned into a unibyte
   string afterwards.  Otherwise CODING->dst_object is nil.

   When SRC_OBJECT and DST_OBJECT are the same, the source range is
   deleted from the buffer, and point and the boundary markers are
   fixed up after encoding.  */
8233 encode_coding_object (struct coding_system
*coding
,
8234 Lisp_Object src_object
,
8235 ptrdiff_t from
, ptrdiff_t from_byte
,
8236 ptrdiff_t to
, ptrdiff_t to_byte
,
8237 Lisp_Object dst_object
)
8239 ptrdiff_t count
= SPECPDL_INDEX ();
8240 ptrdiff_t chars
= to
- from
;
8241 ptrdiff_t bytes
= to_byte
- from_byte
;
8243 ptrdiff_t saved_pt
= -1, saved_pt_byte
IF_LINT (= 0);
8244 bool need_marker_adjustment
= 0;
8245 bool kill_src_buffer
= 0;
8246 Lisp_Object old_deactivate_mark
;
8248 old_deactivate_mark
= Vdeactivate_mark
;
8250 coding
->src_object
= src_object
;
8251 coding
->src_chars
= chars
;
8252 coding
->src_bytes
= bytes
;
8253 coding
->src_multibyte
= chars
< bytes
;
8255 attrs
= CODING_ID_ATTRS (coding
->id
);
/* Converting in place: flag the markers that sit on a boundary of the
   range (FROM for insertion-type markers, TO for the others).  */
8257 if (EQ (src_object
, dst_object
))
8259 struct Lisp_Marker
*tail
;
8261 for (tail
= BUF_MARKERS (current_buffer
); tail
; tail
= tail
->next
)
8263 tail
->need_adjustment
8264 = tail
->charpos
== (tail
->insertion_type
? from
: to
);
8265 need_marker_adjustment
|= tail
->need_adjustment
;
/* With a pre-write function, copy the source text into a work buffer
   and run the function there.  */
8269 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
8271 coding
->src_object
= code_conversion_save (1, coding
->src_multibyte
);
8272 set_buffer_internal (XBUFFER (coding
->src_object
));
8273 if (STRINGP (src_object
))
8274 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
8275 else if (BUFFERP (src_object
))
8276 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
8278 insert_1_both ((char *) coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
/* In-place conversion: remember point and delete the source range from
   the original buffer, then return to the work buffer.  */
8280 if (EQ (src_object
, dst_object
))
8282 set_buffer_internal (XBUFFER (src_object
));
8283 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
8284 del_range_both (from
, from_byte
, to
, to_byte
, 1);
8285 set_buffer_internal (XBUFFER (coding
->src_object
));
8289 struct gcpro gcpro1
, gcpro2
, gcpro3
, gcpro4
, gcpro5
;
/* Call the pre-write function on the range BEG..Z with the objects
   involved protected from GC.  */
8291 GCPRO5 (coding
->src_object
, coding
->dst_object
, src_object
, dst_object
,
8292 old_deactivate_mark
);
8293 safe_call2 (CODING_ATTR_PRE_WRITE (attrs
),
8294 make_number (BEG
), make_number (Z
));
/* If the function left a different buffer current, encode from that
   one and remember to kill it afterwards.  */
8297 if (XBUFFER (coding
->src_object
) != current_buffer
)
8298 kill_src_buffer
= 1;
8299 coding
->src_object
= Fcurrent_buffer ();
/* From here on the source is the entire current buffer.  */
8301 move_gap_both (BEG
, BEG_BYTE
);
8302 coding
->src_chars
= Z
- BEG
;
8303 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
8304 coding
->src_pos
= BEG
;
8305 coding
->src_pos_byte
= BEG_BYTE
;
8306 coding
->src_multibyte
= Z
< Z_BYTE
;
/* String source without a pre-write function: encode straight from
   the string.  */
8308 else if (STRINGP (src_object
))
8310 code_conversion_save (0, 0);
8311 coding
->src_pos
= from
;
8312 coding
->src_pos_byte
= from_byte
;
8314 else if (BUFFERP (src_object
))
8316 code_conversion_save (0, 0);
8317 set_buffer_internal (XBUFFER (src_object
));
8318 if (EQ (src_object
, dst_object
))
/* In place: del_range_1 removes the range; whatever it returns becomes
   the source and is addressed from offset 0.  */
8320 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
8321 coding
->src_object
= del_range_1 (from
, to
, 1, 1);
8322 coding
->src_pos
= 0;
8323 coding
->src_pos_byte
= 0;
/* If the gap lies inside the source range, move it to FROM.  */
8327 if (from
< GPT
&& to
>= GPT
)
8328 move_gap_both (from
, from_byte
);
8329 coding
->src_pos
= from
;
8330 coding
->src_pos_byte
= from_byte
;
8334 code_conversion_save (0, 0);
/* Set up the destination.  */
8336 if (BUFFERP (dst_object
))
8338 coding
->dst_object
= dst_object
;
/* In place: output goes where the source range began.  */
8339 if (EQ (src_object
, dst_object
))
8341 coding
->dst_pos
= from
;
8342 coding
->dst_pos_byte
= from_byte
;
8346 struct buffer
*current
= current_buffer
;
/* Another buffer: output goes at its point, and its gap is moved
   there.  */
8348 set_buffer_temp (XBUFFER (dst_object
));
8349 coding
->dst_pos
= PT
;
8350 coding
->dst_pos_byte
= PT_BYTE
;
8351 move_gap_both (coding
->dst_pos
, coding
->dst_pos_byte
);
8352 set_buffer_temp (current
);
8354 coding
->dst_multibyte
8355 = ! NILP (BVAR (XBUFFER (dst_object
), enable_multibyte_characters
));
8357 else if (EQ (dst_object
, Qt
))
/* Result wanted as a string: start with one byte per source character,
   and at least one byte.  */
8359 ptrdiff_t dst_bytes
= max (1, coding
->src_chars
);
8360 coding
->dst_object
= Qnil
;
8361 coding
->destination
= xmalloc (dst_bytes
);
8362 coding
->dst_bytes
= dst_bytes
;
8363 coding
->dst_multibyte
= 0;
8367 coding
->dst_object
= Qnil
;
8368 coding
->dst_multibyte
= 0;
/* Run the encoder.  */
8371 encode_coding (coding
);
8373 if (EQ (dst_object
, Qt
))
8375 if (BUFFERP (coding
->dst_object
))
8376 coding
->dst_object
= Fbuffer_string ();
8377 else if (coding
->raw_destination
)
8378 /* This is used to avoid creating a huge Lisp string.
   NOTE: a caller who sets `raw_destination' is also
   responsible for freeing the `destination' buffer.  */
8381 coding
->dst_object
= Qnil
;
/* Build a unibyte string from CODING->destination and free the
   memory.  */
8385 = make_unibyte_string ((char *) coding
->destination
,
8387 xfree (coding
->destination
);
8393 /* This is the case of:
   (BUFFERP (src_object) && EQ (src_object, dst_object))
   As we have moved PT while replacing the original buffer
   contents, we must recover it now.  */
8397 set_buffer_internal (XBUFFER (src_object
));
/* Point before the range is kept, point inside it goes to FROM, and
   point after it is shifted by the change in size.  */
8398 if (saved_pt
< from
)
8399 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
8400 else if (saved_pt
< from
+ chars
)
8401 TEMP_SET_PT_BOTH (from
, from_byte
);
8402 else if (! NILP (BVAR (current_buffer
, enable_multibyte_characters
)))
8403 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
8404 saved_pt_byte
+ (coding
->produced
- bytes
));
/* Here both the character and the byte position move by the byte
   difference.  */
8406 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
8407 saved_pt_byte
+ (coding
->produced
- bytes
));
/* Re-seat the markers that were flagged on the boundaries of the
   replaced range.  */
8409 if (need_marker_adjustment
)
8411 struct Lisp_Marker
*tail
;
8413 for (tail
= BUF_MARKERS (current_buffer
); tail
; tail
= tail
->next
)
8414 if (tail
->need_adjustment
)
8416 tail
->need_adjustment
= 0;
8417 if (tail
->insertion_type
)
8419 tail
->bytepos
= from_byte
;
8420 tail
->charpos
= from
;
8424 tail
->bytepos
= from_byte
+ coding
->produced
;
8426 = (NILP (BVAR (current_buffer
, enable_multibyte_characters
))
8427 ? tail
->bytepos
: from
+ coding
->produced_char
);
/* Dispose of the buffer that the pre-write function switched to.  */
8433 if (kill_src_buffer
)
8434 Fkill_buffer (coding
->src_object
);
/* Restore Vdeactivate_mark to the value saved on entry and run the
   unwind handlers.  */
8436 Vdeactivate_mark
= old_deactivate_mark
;
8437 unbind_to (count
, Qnil
);
/* Return the name (CODING_ID_NAME) of the coding system recorded for
   the coding category that comes first in coding_priorities.  */
8442 preferred_coding_system (void)
8444 int id
= coding_categories
[coding_priorities
[0]].id
;
8446 return CODING_ID_NAME (id
);
8449 #if defined (WINDOWSNT) || defined (CYGWIN)
/* Convert STR with code_convert_string_norecord and Qutf_16le, passing
   0 as the last argument (to_unicode passes 1), and return the result.
   NOTE(review): the second half of the condition below (original line
   8456) is absent from this listing; when the whole test holds, STR is
   first replaced by a copy without its last element.  */
8452 from_unicode (Lisp_Object str
)
8455 if (!STRING_MULTIBYTE (str
) &&
8458 str
= Fsubstring (str
, make_number (0), make_number (-1));
8461 return code_convert_string_norecord (str
, Qutf_16le
, 0);
/* Convert the wide-character string WSTR by wrapping it in a unibyte
   string and passing that to from_unicode.  The length given is one
   byte more than sizeof (wchar_t) times the wcslen of WSTR.  */
8465 from_unicode_buffer (const wchar_t* wstr
)
8467 return from_unicode (
8468 make_unibyte_string (
8470 /* We get one of the two final 0 bytes for free.  */
8471 1 + sizeof (wchar_t) * wcslen (wstr
)));
/* Convert STR with code_convert_string_norecord and Qutf_16le (last
   argument 1), leave the result in *BUF, and return WCSDATA (*BUF).
   A copy with one extra zero byte is built in between.
   NOTE(review): the statement that stores that copy into *BUF is on a
   line absent from this listing -- confirm.  */
8475 to_unicode (Lisp_Object str
, Lisp_Object
*buf
)
8477 *buf
= code_convert_string_norecord (str
, Qutf_16le
, 1);
8478 /* We need to make another copy (in addition to the one made by
   code_convert_string_norecord) to ensure that the final string is
   _doubly_ zero terminated --- that is, that the string is
   terminated by two zero bytes and one utf-16le null character.
   Because strings are already terminated with a single zero byte,
   we just add one additional zero.  */
8484 str
= make_uninit_string (SBYTES (*buf
) + 1);
8485 memcpy (SDATA (str
), SDATA (*buf
), SBYTES (*buf
));
8486 SDATA (str
) [SBYTES (*buf
)] = '\0';
8488 return WCSDATA (*buf
);
8491 #endif /* WINDOWSNT || CYGWIN */
8495 /*** 8. Emacs Lisp library functions ***/
8497 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
8498 doc
: /* Return t if OBJECT is nil or a coding-system.
8499 See the documentation of `define-coding-system' for information
8500 about coding-system objects. */)
8501 (Lisp_Object object
)
/* OBJECT qualifies when CODING_SYSTEM_ID finds a non-negative id for
   it.  NOTE(review): the first half of this test and the values
   returned are on lines absent from this listing.  */
8504 || CODING_SYSTEM_ID (object
) >= 0)
/* Next, test whether OBJECT is a symbol that carries a
   Qcoding_system_define_form property.  */
8506 if (! SYMBOLP (object
)
8507 || NILP (Fget (object
, Qcoding_system_define_form
)))
8512 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
8513 Sread_non_nil_coding_system
, 1, 1, 0,
8514 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8515 (Lisp_Object prompt
)
/* Read with completion over Vcoding_system_alist, recording input in
   Qcoding_system_history.  */
8520 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
8521 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
/* Presumably the tail of a do-while loop whose opening line is absent
   from this listing: the read is repeated while the input is empty.  */
8523 while (SCHARS (val
) == 0);
/* The string read is interned and returned as a symbol.  */
8524 return (Fintern (val
, Qnil
));
8527 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
8528 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8529 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8530 Ignores case when completing coding systems (all Emacs coding systems
8531 are lower-case). */)
8532 (Lisp_Object prompt
, Lisp_Object default_coding_system
)
8535 ptrdiff_t count
= SPECPDL_INDEX ();
/* A symbol given as the default is replaced by its name before being
   handed to Fcompleting_read.  */
8537 if (SYMBOLP (default_coding_system
))
8538 default_coding_system
= SYMBOL_NAME (default_coding_system
);
/* Qcompletion_ignore_case is bound to t only for the duration of the
   read; unbind_to below undoes the binding.  */
8539 specbind (Qcompletion_ignore_case
, Qt
);
8540 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
8541 Qt
, Qnil
, Qcoding_system_history
,
8542 default_coding_system
, Qnil
);
8543 unbind_to (count
, Qnil
);
/* Empty input yields nil; anything else is interned as a symbol.  */
8544 return (SCHARS (val
) == 0 ? Qnil
: Fintern (val
, Qnil
));
8547 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
8549 doc
: /* Check validity of CODING-SYSTEM.
8550 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8551 It is valid if it is nil or a symbol defined as a coding system by the
8552 function `define-coding-system'. */)
8553 (Lisp_Object coding_system
)
8555 Lisp_Object define_form
;
/* If the symbol carries a Qcoding_system_define_form property, clear
   the property and then evaluate the form it held.  */
8557 define_form
= Fget (coding_system
, Qcoding_system_define_form
);
8558 if (! NILP (define_form
))
8560 Fput (coding_system
, Qcoding_system_define_form
, Qnil
);
8561 safe_eval (define_form
);
/* Return CODING_SYSTEM when Fcoding_system_p accepts it; otherwise
   signal Qcoding_system_error with it as data.  */
8563 if (!NILP (Fcoding_system_p (coding_system
)))
8564 return coding_system
;
8565 xsignal1 (Qcoding_system_error
, coding_system
);
8569 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8570 HIGHEST, return the coding system of the highest
8571 priority among the detected coding systems. Otherwise return a
8572 list of detected coding systems sorted by their priorities. If
8573 MULTIBYTEP, it is assumed that the bytes are in correct
8574 multibyte form but contains only ASCII and eight-bit chars.
8575 Otherwise, the bytes are raw bytes.
8577 CODING-SYSTEM controls the detection as below:
8579 If it is nil, detect both text-format and eol-format. If the
8580 text-format part of CODING-SYSTEM is already specified
8581 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8582 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8583 detect only text-format. */
8586 detect_coding_system (const unsigned char *src
,
8587 ptrdiff_t src_chars
, ptrdiff_t src_bytes
,
8588 bool highest
, bool multibytep
,
8589 Lisp_Object coding_system
)
8591 const unsigned char *src_end
= src
+ src_bytes
;
8592 Lisp_Object attrs
, eol_type
;
8593 Lisp_Object val
= Qnil
;
8594 struct coding_system coding
;
8596 struct coding_detection_info detect_info
;
8597 enum coding_category base_category
;
8598 bool null_byte_found
= 0, eight_bit_found
= 0;
8600 if (NILP (coding_system
))
8601 coding_system
= Qundecided
;
8602 setup_coding_system (coding_system
, &coding
);
8603 attrs
= CODING_ID_ATTRS (coding
.id
);
8604 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
8605 coding_system
= CODING_ATTR_BASE_NAME (attrs
);
8607 coding
.source
= src
;
8608 coding
.src_chars
= src_chars
;
8609 coding
.src_bytes
= src_bytes
;
8610 coding
.src_multibyte
= multibytep
;
8611 coding
.consumed
= 0;
8612 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
8613 coding
.head_ascii
= 0;
8615 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
8617 /* At first, detect text-format if necessary. */
8618 base_category
= XINT (CODING_ATTR_CATEGORY (attrs
));
8619 if (base_category
== coding_category_undecided
)
8621 enum coding_category category
IF_LINT (= 0);
8622 struct coding_system
*this IF_LINT (= NULL
);
8624 bool inhibit_nbd
= inhibit_flag (coding
.spec
.undecided
.inhibit_nbd
,
8625 inhibit_null_byte_detection
);
8626 bool inhibit_ied
= inhibit_flag (coding
.spec
.undecided
.inhibit_ied
,
8627 inhibit_iso_escape_detection
);
8628 bool prefer_utf_8
= coding
.spec
.undecided
.prefer_utf_8
;
8630 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8631 for (; src
< src_end
; src
++)
8636 eight_bit_found
= 1;
8637 if (null_byte_found
)
8642 if ((c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
8644 && ! detect_info
.checked
)
8646 if (detect_coding_iso_2022 (&coding
, &detect_info
))
8648 /* We have scanned the whole data. */
8649 if (! (detect_info
.rejected
& CATEGORY_MASK_ISO_7_ELSE
))
8651 /* We didn't find an 8-bit code. We may
8652 have found a null-byte, but it's very
8653 rare that a binary file conforms to
8656 coding
.head_ascii
= src
- coding
.source
;
8658 detect_info
.rejected
|= ~CATEGORY_MASK_ISO_ESCAPE
;
8662 else if (! c
&& !inhibit_nbd
)
8664 null_byte_found
= 1;
8665 if (eight_bit_found
)
8668 if (! eight_bit_found
)
8669 coding
.head_ascii
++;
8671 else if (! eight_bit_found
)
8672 coding
.head_ascii
++;
8675 if (null_byte_found
|| eight_bit_found
8676 || coding
.head_ascii
< coding
.src_bytes
8677 || detect_info
.found
)
8679 if (coding
.head_ascii
== coding
.src_bytes
)
8680 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8681 for (i
= 0; i
< coding_category_raw_text
; i
++)
8683 category
= coding_priorities
[i
];
8684 this = coding_categories
+ category
;
8685 if (detect_info
.found
& (1 << category
))
8690 if (null_byte_found
)
8692 detect_info
.checked
|= ~CATEGORY_MASK_UTF_16
;
8693 detect_info
.rejected
|= ~CATEGORY_MASK_UTF_16
;
8695 else if (prefer_utf_8
8696 && detect_coding_utf_8 (&coding
, &detect_info
))
8698 detect_info
.checked
|= ~CATEGORY_MASK_UTF_8
;
8699 detect_info
.rejected
|= ~CATEGORY_MASK_UTF_8
;
8701 for (i
= 0; i
< coding_category_raw_text
; i
++)
8703 category
= coding_priorities
[i
];
8704 this = coding_categories
+ category
;
8708 /* No coding system of this category is defined. */
8709 detect_info
.rejected
|= (1 << category
);
8711 else if (category
>= coding_category_raw_text
)
8713 else if (detect_info
.checked
& (1 << category
))
8716 && (detect_info
.found
& (1 << category
)))
8719 else if ((*(this->detector
)) (&coding
, &detect_info
)
8721 && (detect_info
.found
& (1 << category
)))
8723 if (category
== coding_category_utf_16_auto
)
8725 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
8726 category
= coding_category_utf_16_le
;
8728 category
= coding_category_utf_16_be
;
8736 if ((detect_info
.rejected
& CATEGORY_MASK_ANY
) == CATEGORY_MASK_ANY
8739 detect_info
.found
= CATEGORY_MASK_RAW_TEXT
;
8740 id
= CODING_SYSTEM_ID (Qno_conversion
);
8741 val
= list1 (make_number (id
));
8743 else if (! detect_info
.rejected
&& ! detect_info
.found
)
8745 detect_info
.found
= CATEGORY_MASK_ANY
;
8746 id
= coding_categories
[coding_category_undecided
].id
;
8747 val
= list1 (make_number (id
));
8751 if (detect_info
.found
)
8753 detect_info
.found
= 1 << category
;
8754 val
= list1 (make_number (this->id
));
8757 for (i
= 0; i
< coding_category_raw_text
; i
++)
8758 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
8760 detect_info
.found
= 1 << coding_priorities
[i
];
8761 id
= coding_categories
[coding_priorities
[i
]].id
;
8762 val
= list1 (make_number (id
));
8768 int mask
= detect_info
.rejected
| detect_info
.found
;
8771 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
8773 category
= coding_priorities
[i
];
8774 if (! (mask
& (1 << category
)))
8776 found
|= 1 << category
;
8777 id
= coding_categories
[category
].id
;
8779 val
= list1 (make_number (id
));
8782 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
8784 category
= coding_priorities
[i
];
8785 if (detect_info
.found
& (1 << category
))
8787 id
= coding_categories
[category
].id
;
8788 val
= Fcons (make_number (id
), val
);
8791 detect_info
.found
|= found
;
8794 else if (base_category
== coding_category_utf_8_auto
)
8796 if (detect_coding_utf_8 (&coding
, &detect_info
))
8798 struct coding_system
*this;
8800 if (detect_info
.found
& CATEGORY_MASK_UTF_8_SIG
)
8801 this = coding_categories
+ coding_category_utf_8_sig
;
8803 this = coding_categories
+ coding_category_utf_8_nosig
;
8804 val
= list1 (make_number (this->id
));
8807 else if (base_category
== coding_category_utf_16_auto
)
8809 if (detect_coding_utf_16 (&coding
, &detect_info
))
8811 struct coding_system
*this;
8813 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
8814 this = coding_categories
+ coding_category_utf_16_le
;
8815 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
8816 this = coding_categories
+ coding_category_utf_16_be
;
8817 else if (detect_info
.rejected
& CATEGORY_MASK_UTF_16_LE_NOSIG
)
8818 this = coding_categories
+ coding_category_utf_16_be_nosig
;
8820 this = coding_categories
+ coding_category_utf_16_le_nosig
;
8821 val
= list1 (make_number (this->id
));
8826 detect_info
.found
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
8827 val
= list1 (make_number (coding
.id
));
8830 /* Then, detect eol-format if necessary. */
8832 int normal_eol
= -1, utf_16_be_eol
= -1, utf_16_le_eol
= -1;
8835 if (VECTORP (eol_type
))
8837 if (detect_info
.found
& ~CATEGORY_MASK_UTF_16
)
8839 if (null_byte_found
)
8840 normal_eol
= EOL_SEEN_LF
;
8842 normal_eol
= detect_eol (coding
.source
, src_bytes
,
8843 coding_category_raw_text
);
8845 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_BE
8846 | CATEGORY_MASK_UTF_16_BE_NOSIG
))
8847 utf_16_be_eol
= detect_eol (coding
.source
, src_bytes
,
8848 coding_category_utf_16_be
);
8849 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
8850 | CATEGORY_MASK_UTF_16_LE_NOSIG
))
8851 utf_16_le_eol
= detect_eol (coding
.source
, src_bytes
,
8852 coding_category_utf_16_le
);
8856 if (EQ (eol_type
, Qunix
))
8857 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_LF
;
8858 else if (EQ (eol_type
, Qdos
))
8859 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CRLF
;
8861 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CR
;
8864 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
8866 enum coding_category category
;
8869 id
= XINT (XCAR (tail
));
8870 attrs
= CODING_ID_ATTRS (id
);
8871 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
8872 eol_type
= CODING_ID_EOL_TYPE (id
);
8873 if (VECTORP (eol_type
))
8875 if (category
== coding_category_utf_16_be
8876 || category
== coding_category_utf_16_be_nosig
)
8877 this_eol
= utf_16_be_eol
;
8878 else if (category
== coding_category_utf_16_le
8879 || category
== coding_category_utf_16_le_nosig
)
8880 this_eol
= utf_16_le_eol
;
8882 this_eol
= normal_eol
;
8884 if (this_eol
== EOL_SEEN_LF
)
8885 XSETCAR (tail
, AREF (eol_type
, 0));
8886 else if (this_eol
== EOL_SEEN_CRLF
)
8887 XSETCAR (tail
, AREF (eol_type
, 1));
8888 else if (this_eol
== EOL_SEEN_CR
)
8889 XSETCAR (tail
, AREF (eol_type
, 2));
8891 XSETCAR (tail
, CODING_ID_NAME (id
));
8894 XSETCAR (tail
, CODING_ID_NAME (id
));
8898 return (highest
? (CONSP (val
) ? XCAR (val
) : Qnil
) : val
);
8902 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
8904 doc
: /* Detect coding system of the text in the region between START and END.
8905 Return a list of possible coding systems ordered by priority.
8906 The coding systems to try and their priorities follows what
8907 the function `coding-system-priority-list' (which see) returns.
8909 If only ASCII characters are found (except for such ISO-2022 control
8910 characters as ESC), it returns a list of single element `undecided'
8911 or its subsidiary coding system according to a detected end-of-line
8914 If optional argument HIGHEST is non-nil, return the coding system of
8915 highest priority. */)
8916 (Lisp_Object start
, Lisp_Object end
, Lisp_Object highest
)
8919 ptrdiff_t from_byte
, to_byte
;
8921 validate_region (&start
, &end
);
8922 from
= XINT (start
), to
= XINT (end
);
8923 from_byte
= CHAR_TO_BYTE (from
);
8924 to_byte
= CHAR_TO_BYTE (to
);
8926 if (from
< GPT
&& to
>= GPT
)
8927 move_gap_both (to
, to_byte
);
8929 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
8930 to
- from
, to_byte
- from_byte
,
8932 !NILP (BVAR (current_buffer
8933 , enable_multibyte_characters
)),
8937 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
8939 doc
: /* Detect coding system of the text in STRING.
8940 Return a list of possible coding systems ordered by priority.
8941 The coding systems to try and their priorities follows what
8942 the function `coding-system-priority-list' (which see) returns.
8944 If only ASCII characters are found (except for such ISO-2022 control
8945 characters as ESC), it returns a list of single element `undecided'
8946 or its subsidiary coding system according to a detected end-of-line
8949 If optional argument HIGHEST is non-nil, return the coding system of
8950 highest priority. */)
8951 (Lisp_Object string
, Lisp_Object highest
)
8953 CHECK_STRING (string
);
8955 return detect_coding_system (SDATA (string
),
8956 SCHARS (string
), SBYTES (string
),
8957 !NILP (highest
), STRING_MULTIBYTE (string
),
8963 char_encodable_p (int c
, Lisp_Object attrs
)
8966 struct charset
*charset
;
8967 Lisp_Object translation_table
;
8969 translation_table
= CODING_ATTR_TRANS_TBL (attrs
);
8970 if (! NILP (translation_table
))
8971 c
= translate_char (translation_table
, c
);
8972 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
8973 CONSP (tail
); tail
= XCDR (tail
))
8975 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
8976 if (CHAR_CHARSET_P (c
, charset
))
8979 return (! NILP (tail
));
8983 /* Return a list of coding systems that safely encode the text between
8984 START and END. If EXCLUDE is non-nil, it is a list of coding
8985 systems not to check. The returned list doesn't contain any such
8986 coding systems. In any case, if the text contains only ASCII or is
8987 unibyte, return t. */
8989 DEFUN ("find-coding-systems-region-internal",
8990 Ffind_coding_systems_region_internal
,
8991 Sfind_coding_systems_region_internal
, 2, 3, 0,
8992 doc
: /* Internal use only. */)
8993 (Lisp_Object start
, Lisp_Object end
, Lisp_Object exclude
)
8995 Lisp_Object coding_attrs_list
, safe_codings
;
8996 ptrdiff_t start_byte
, end_byte
;
8997 const unsigned char *p
, *pbeg
, *pend
;
8999 Lisp_Object tail
, elt
, work_table
;
9001 if (STRINGP (start
))
9003 if (!STRING_MULTIBYTE (start
)
9004 || SCHARS (start
) == SBYTES (start
))
9007 end_byte
= SBYTES (start
);
9011 CHECK_NUMBER_COERCE_MARKER (start
);
9012 CHECK_NUMBER_COERCE_MARKER (end
);
9013 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
9014 args_out_of_range (start
, end
);
9015 if (NILP (BVAR (current_buffer
, enable_multibyte_characters
)))
9017 start_byte
= CHAR_TO_BYTE (XINT (start
));
9018 end_byte
= CHAR_TO_BYTE (XINT (end
));
9019 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
9022 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
9024 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
9025 move_gap_both (XINT (start
), start_byte
);
9027 move_gap_both (XINT (end
), end_byte
);
9031 coding_attrs_list
= Qnil
;
9032 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
9034 || NILP (Fmemq (XCAR (tail
), exclude
)))
9038 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
9039 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
)))
9041 ASET (attrs
, coding_attr_trans_tbl
,
9042 get_translation_table (attrs
, 1, NULL
));
9043 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
9047 if (STRINGP (start
))
9048 p
= pbeg
= SDATA (start
);
9050 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
9051 pend
= p
+ (end_byte
- start_byte
);
9053 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
9054 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
9056 work_table
= Fmake_char_table (Qnil
, Qnil
);
9059 if (ASCII_BYTE_P (*p
))
9063 c
= STRING_CHAR_ADVANCE (p
);
9064 if (!NILP (char_table_ref (work_table
, c
)))
9065 /* This character was already checked. Ignore it. */
9068 charset_map_loaded
= 0;
9069 for (tail
= coding_attrs_list
; CONSP (tail
);)
9074 else if (char_encodable_p (c
, elt
))
9076 else if (CONSP (XCDR (tail
)))
9078 XSETCAR (tail
, XCAR (XCDR (tail
)));
9079 XSETCDR (tail
, XCDR (XCDR (tail
)));
9083 XSETCAR (tail
, Qnil
);
9087 if (charset_map_loaded
)
9089 ptrdiff_t p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
9091 if (STRINGP (start
))
9092 pbeg
= SDATA (start
);
9094 pbeg
= BYTE_POS_ADDR (start_byte
);
9095 p
= pbeg
+ p_offset
;
9096 pend
= pbeg
+ pend_offset
;
9098 char_table_set (work_table
, c
, Qt
);
9102 safe_codings
= list2 (Qraw_text
, Qno_conversion
);
9103 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
9104 if (! NILP (XCAR (tail
)))
9105 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
9107 return safe_codings
;
9111 DEFUN ("unencodable-char-position", Funencodable_char_position
,
9112 Sunencodable_char_position
, 3, 5, 0,
9114 Return position of first un-encodable character in a region.
9115 START and END specify the region and CODING-SYSTEM specifies the
9116 encoding to check. Return nil if CODING-SYSTEM does encode the region.
9118 If optional 4th argument COUNT is non-nil, it specifies at most how
9119 many un-encodable characters to search. In this case, the value is a
9122 If optional 5th argument STRING is non-nil, it is a string to search
9123 for un-encodable characters. In that case, START and END are indexes
9125 (Lisp_Object start
, Lisp_Object end
, Lisp_Object coding_system
, Lisp_Object count
, Lisp_Object string
)
9128 struct coding_system coding
;
9129 Lisp_Object attrs
, charset_list
, translation_table
;
9130 Lisp_Object positions
;
9132 const unsigned char *p
, *stop
, *pend
;
9133 bool ascii_compatible
;
9135 setup_coding_system (Fcheck_coding_system (coding_system
), &coding
);
9136 attrs
= CODING_ID_ATTRS (coding
.id
);
9137 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
9139 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
9140 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
9141 translation_table
= get_translation_table (attrs
, 1, NULL
);
9145 validate_region (&start
, &end
);
9146 from
= XINT (start
);
9148 if (NILP (BVAR (current_buffer
, enable_multibyte_characters
))
9149 || (ascii_compatible
9150 && (to
- from
) == (CHAR_TO_BYTE (to
) - (CHAR_TO_BYTE (from
)))))
9152 p
= CHAR_POS_ADDR (from
);
9153 pend
= CHAR_POS_ADDR (to
);
9154 if (from
< GPT
&& to
>= GPT
)
9161 CHECK_STRING (string
);
9162 CHECK_NATNUM (start
);
9164 if (! (XINT (start
) <= XINT (end
) && XINT (end
) <= SCHARS (string
)))
9165 args_out_of_range_3 (string
, start
, end
);
9166 from
= XINT (start
);
9168 if (! STRING_MULTIBYTE (string
))
9170 p
= SDATA (string
) + string_char_to_byte (string
, from
);
9171 stop
= pend
= SDATA (string
) + string_char_to_byte (string
, to
);
9172 if (ascii_compatible
&& (to
- from
) == (pend
- p
))
9180 CHECK_NATNUM (count
);
9185 charset_map_loaded
= 0;
9190 if (ascii_compatible
)
9191 while (p
< stop
&& ASCII_BYTE_P (*p
))
9201 c
= STRING_CHAR_ADVANCE (p
);
9202 if (! (ASCII_CHAR_P (c
) && ascii_compatible
)
9203 && ! char_charset (translate_char (translation_table
, c
),
9204 charset_list
, NULL
))
9206 positions
= Fcons (make_number (from
), positions
);
9213 if (charset_map_loaded
&& NILP (string
))
9215 p
= CHAR_POS_ADDR (from
);
9216 pend
= CHAR_POS_ADDR (to
);
9217 if (from
< GPT
&& to
>= GPT
)
9221 charset_map_loaded
= 0;
9225 return (NILP (count
) ? Fcar (positions
) : Fnreverse (positions
));
9229 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
9230 Scheck_coding_systems_region
, 3, 3, 0,
9231 doc
: /* Check if the region is encodable by coding systems.
9233 START and END are buffer positions specifying the region.
9234 CODING-SYSTEM-LIST is a list of coding systems to check.
9236 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9237 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9238 whole region, POS0, POS1, ... are buffer positions where non-encodable
9239 characters are found.
9241 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9244 START may be a string. In that case, check if the string is
9245 encodable, and the value contains indices to the string instead of
9246 buffer positions. END is ignored.
9248 If the current buffer (or START if it is a string) is unibyte, the value
9250 (Lisp_Object start
, Lisp_Object end
, Lisp_Object coding_system_list
)
9253 ptrdiff_t start_byte
, end_byte
;
9255 const unsigned char *p
, *pbeg
, *pend
;
9257 Lisp_Object tail
, elt
, attrs
;
9259 if (STRINGP (start
))
9261 if (!STRING_MULTIBYTE (start
)
9262 || SCHARS (start
) == SBYTES (start
))
9265 end_byte
= SBYTES (start
);
9270 CHECK_NUMBER_COERCE_MARKER (start
);
9271 CHECK_NUMBER_COERCE_MARKER (end
);
9272 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
9273 args_out_of_range (start
, end
);
9274 if (NILP (BVAR (current_buffer
, enable_multibyte_characters
)))
9276 start_byte
= CHAR_TO_BYTE (XINT (start
));
9277 end_byte
= CHAR_TO_BYTE (XINT (end
));
9278 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
9281 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
9283 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
9284 move_gap_both (XINT (start
), start_byte
);
9286 move_gap_both (XINT (end
), end_byte
);
9292 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
9295 attrs
= AREF (CODING_SYSTEM_SPEC (elt
), 0);
9296 ASET (attrs
, coding_attr_trans_tbl
,
9297 get_translation_table (attrs
, 1, NULL
));
9298 list
= Fcons (list2 (elt
, attrs
), list
);
9301 if (STRINGP (start
))
9302 p
= pbeg
= SDATA (start
);
9304 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
9305 pend
= p
+ (end_byte
- start_byte
);
9307 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
9308 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
9312 if (ASCII_BYTE_P (*p
))
9316 c
= STRING_CHAR_ADVANCE (p
);
9318 charset_map_loaded
= 0;
9319 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
9321 elt
= XCDR (XCAR (tail
));
9322 if (! char_encodable_p (c
, XCAR (elt
)))
9323 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
9325 if (charset_map_loaded
)
9327 ptrdiff_t p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
9329 if (STRINGP (start
))
9330 pbeg
= SDATA (start
);
9332 pbeg
= BYTE_POS_ADDR (start_byte
);
9333 p
= pbeg
+ p_offset
;
9334 pend
= pbeg
+ pend_offset
;
9342 for (; CONSP (tail
); tail
= XCDR (tail
))
9345 if (CONSP (XCDR (XCDR (elt
))))
9346 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
9355 code_convert_region (Lisp_Object start
, Lisp_Object end
,
9356 Lisp_Object coding_system
, Lisp_Object dst_object
,
9357 bool encodep
, bool norecord
)
9359 struct coding_system coding
;
9360 ptrdiff_t from
, from_byte
, to
, to_byte
;
9361 Lisp_Object src_object
;
9363 if (NILP (coding_system
))
9364 coding_system
= Qno_conversion
;
9366 CHECK_CODING_SYSTEM (coding_system
);
9367 src_object
= Fcurrent_buffer ();
9368 if (NILP (dst_object
))
9369 dst_object
= src_object
;
9370 else if (! EQ (dst_object
, Qt
))
9371 CHECK_BUFFER (dst_object
);
9373 validate_region (&start
, &end
);
9374 from
= XFASTINT (start
);
9375 from_byte
= CHAR_TO_BYTE (from
);
9376 to
= XFASTINT (end
);
9377 to_byte
= CHAR_TO_BYTE (to
);
9379 setup_coding_system (coding_system
, &coding
);
9380 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
9382 if (BUFFERP (dst_object
) && !EQ (dst_object
, src_object
))
9384 struct buffer
*buf
= XBUFFER (dst_object
);
9385 ptrdiff_t buf_pt
= BUF_PT (buf
);
9387 invalidate_buffer_caches (buf
, buf_pt
, buf_pt
);
9391 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
9394 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
9397 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
9399 return (BUFFERP (dst_object
)
9400 ? make_number (coding
.produced_char
)
9401 : coding
.dst_object
);
9405 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
9406 3, 4, "r\nzCoding system: ",
9407 doc
: /* Decode the current region from the specified coding system.
9408 When called from a program, takes four arguments:
9409 START, END, CODING-SYSTEM, and DESTINATION.
9410 START and END are buffer positions.
9412 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9413 If nil, the region between START and END is replaced by the decoded text.
9414 If buffer, the decoded text is inserted in that buffer after point (point
9416 In those cases, the length of the decoded text is returned.
9417 If DESTINATION is t, the decoded text is returned.
9419 This function sets `last-coding-system-used' to the precise coding system
9420 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9421 not fully specified.) */)
9422 (Lisp_Object start
, Lisp_Object end
, Lisp_Object coding_system
, Lisp_Object destination
)
9424 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
9427 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
9428 3, 4, "r\nzCoding system: ",
9429 doc
: /* Encode the current region by specified coding system.
9430 When called from a program, takes four arguments:
9431 START, END, CODING-SYSTEM and DESTINATION.
9432 START and END are buffer positions.
9434 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9435 If nil, the region between START and END is replace by the encoded text.
9436 If buffer, the encoded text is inserted in that buffer after point (point
9438 In those cases, the length of the encoded text is returned.
9439 If DESTINATION is t, the encoded text is returned.
9441 This function sets `last-coding-system-used' to the precise coding system
9442 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9443 not fully specified.) */)
9444 (Lisp_Object start
, Lisp_Object end
, Lisp_Object coding_system
, Lisp_Object destination
)
9446 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
9450 code_convert_string (Lisp_Object string
, Lisp_Object coding_system
,
9451 Lisp_Object dst_object
, bool encodep
, bool nocopy
,
9454 struct coding_system coding
;
9455 ptrdiff_t chars
, bytes
;
9457 CHECK_STRING (string
);
9458 if (NILP (coding_system
))
9461 Vlast_coding_system_used
= Qno_conversion
;
9462 if (NILP (dst_object
))
9463 return (nocopy
? Fcopy_sequence (string
) : string
);
9466 if (NILP (coding_system
))
9467 coding_system
= Qno_conversion
;
9469 CHECK_CODING_SYSTEM (coding_system
);
9470 if (NILP (dst_object
))
9472 else if (! EQ (dst_object
, Qt
))
9473 CHECK_BUFFER (dst_object
);
9475 setup_coding_system (coding_system
, &coding
);
9476 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
9477 chars
= SCHARS (string
);
9478 bytes
= SBYTES (string
);
9480 if (BUFFERP (dst_object
))
9482 struct buffer
*buf
= XBUFFER (dst_object
);
9483 ptrdiff_t buf_pt
= BUF_PT (buf
);
9485 invalidate_buffer_caches (buf
, buf_pt
, buf_pt
);
9489 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
9491 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
9493 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
9495 return (BUFFERP (dst_object
)
9496 ? make_number (coding
.produced_char
)
9497 : coding
.dst_object
);
9501 /* Encode or decode STRING according to CODING_SYSTEM.
9502 Do not set Vlast_coding_system_used.
9504 This function is called only from macros DECODE_FILE and
9505 ENCODE_FILE, thus we ignore character composition. */
9508 code_convert_string_norecord (Lisp_Object string
, Lisp_Object coding_system
,
9511 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
9514 /* Encode or decode a file name, to or from a unibyte string suitable
9515 for passing to C library functions. */
9517 decode_file_name (Lisp_Object fname
)
9520 /* The w32 build pretends to use UTF-8 for file-name encoding, and
9521 converts the file names either to UTF-16LE or to the system ANSI
9522 codepage internally, depending on the underlying OS; see w32.c. */
9523 if (! NILP (Fcoding_system_p (Qutf_8
)))
9524 return code_convert_string_norecord (fname
, Qutf_8
, 0);
9526 #else /* !WINDOWSNT */
9527 if (! NILP (Vfile_name_coding_system
))
9528 return code_convert_string_norecord (fname
, Vfile_name_coding_system
, 0);
9529 else if (! NILP (Vdefault_file_name_coding_system
))
9530 return code_convert_string_norecord (fname
,
9531 Vdefault_file_name_coding_system
, 0);
9538 encode_file_name (Lisp_Object fname
)
9540 /* This is especially important during bootstrap and dumping, when
9541 file-name encoding is not yet known, and therefore any non-ASCII
9542 file names are unibyte strings, and could only be thrashed if we
9543 try to encode them. */
9544 if (!STRING_MULTIBYTE (fname
))
9547 /* The w32 build pretends to use UTF-8 for file-name encoding, and
9548 converts the file names either to UTF-16LE or to the system ANSI
9549 codepage internally, depending on the underlying OS; see w32.c. */
9550 if (! NILP (Fcoding_system_p (Qutf_8
)))
9551 return code_convert_string_norecord (fname
, Qutf_8
, 1);
9553 #else /* !WINDOWSNT */
9554 if (! NILP (Vfile_name_coding_system
))
9555 return code_convert_string_norecord (fname
, Vfile_name_coding_system
, 1);
9556 else if (! NILP (Vdefault_file_name_coding_system
))
9557 return code_convert_string_norecord (fname
,
9558 Vdefault_file_name_coding_system
, 1);
9564 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
9566 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9568 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9569 if the decoding operation is trivial.
9571 Optional fourth arg BUFFER non-nil means that the decoded text is
9572 inserted in that buffer after point (point does not move). In this
9573 case, the return value is the length of the decoded text.
9575 This function sets `last-coding-system-used' to the precise coding system
9576 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9577 not fully specified.) */)
9578 (Lisp_Object string
, Lisp_Object coding_system
, Lisp_Object nocopy
, Lisp_Object buffer
)
9580 return code_convert_string (string
, coding_system
, buffer
,
9581 0, ! NILP (nocopy
), 0);
9584 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
9586 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
9588 Optional third arg NOCOPY non-nil means it is OK to return STRING
9589 itself if the encoding operation is trivial.
9591 Optional fourth arg BUFFER non-nil means that the encoded text is
9592 inserted in that buffer after point (point does not move). In this
9593 case, the return value is the length of the encoded text.
9595 This function sets `last-coding-system-used' to the precise coding system
9596 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9597 not fully specified.) */)
9598 (Lisp_Object string
, Lisp_Object coding_system
, Lisp_Object nocopy
, Lisp_Object buffer
)
9600 return code_convert_string (string
, coding_system
, buffer
,
9601 1, ! NILP (nocopy
), 0);
9605 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
9606 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
9607 Return the corresponding character. */)
9610 Lisp_Object spec
, attrs
, val
;
9611 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
9615 CHECK_NATNUM (code
);
9616 ch
= XFASTINT (code
);
9617 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
9618 attrs
= AREF (spec
, 0);
9620 if (ASCII_BYTE_P (ch
)
9621 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
9624 val
= CODING_ATTR_CHARSET_LIST (attrs
);
9625 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
9626 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
9627 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
9632 charset
= charset_roman
;
9634 else if (ch
>= 0xA0 && ch
< 0xDF)
9637 charset
= charset_kana
;
9641 EMACS_INT c1
= ch
>> 8;
9644 if (c1
< 0x81 || (c1
> 0x9F && c1
< 0xE0) || c1
> 0xEF
9645 || c2
< 0x40 || c2
== 0x7F || c2
> 0xFC)
9646 error ("Invalid code: %"pI
"d", ch
);
9649 charset
= charset_kanji
;
9651 c
= DECODE_CHAR (charset
, c
);
9653 error ("Invalid code: %"pI
"d", ch
);
9654 return make_number (c
);
9658 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
9659 doc
: /* Encode a Japanese character CH to shift_jis encoding.
9660 Return the corresponding code in SJIS. */)
9663 Lisp_Object spec
, attrs
, charset_list
;
9665 struct charset
*charset
;
9668 CHECK_CHARACTER (ch
);
9670 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
9671 attrs
= AREF (spec
, 0);
9673 if (ASCII_CHAR_P (c
)
9674 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
9677 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
9678 charset
= char_charset (c
, charset_list
, &code
);
9679 if (code
== CHARSET_INVALID_CODE (charset
))
9680 error ("Can't encode by shift_jis encoding: %c", c
);
9683 return make_number (code
);
9686 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
9687 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
9688 Return the corresponding character. */)
9691 Lisp_Object spec
, attrs
, val
;
9692 struct charset
*charset_roman
, *charset_big5
, *charset
;
9696 CHECK_NATNUM (code
);
9697 ch
= XFASTINT (code
);
9698 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
9699 attrs
= AREF (spec
, 0);
9701 if (ASCII_BYTE_P (ch
)
9702 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
9705 val
= CODING_ATTR_CHARSET_LIST (attrs
);
9706 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
9707 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
9712 charset
= charset_roman
;
9716 EMACS_INT b1
= ch
>> 8;
9718 if (b1
< 0xA1 || b1
> 0xFE
9719 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
9720 error ("Invalid code: %"pI
"d", ch
);
9722 charset
= charset_big5
;
9724 c
= DECODE_CHAR (charset
, c
);
9726 error ("Invalid code: %"pI
"d", ch
);
9727 return make_number (c
);
9730 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
9731 doc
: /* Encode the Big5 character CH to BIG5 coding system.
9732 Return the corresponding character code in Big5. */)
9735 Lisp_Object spec
, attrs
, charset_list
;
9736 struct charset
*charset
;
9740 CHECK_CHARACTER (ch
);
9742 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
9743 attrs
= AREF (spec
, 0);
9744 if (ASCII_CHAR_P (c
)
9745 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
9748 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
9749 charset
= char_charset (c
, charset_list
, &code
);
9750 if (code
== CHARSET_INVALID_CODE (charset
))
9751 error ("Can't encode by Big5 encoding: %c", c
);
9753 return make_number (code
);
9757 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal
,
9758 Sset_terminal_coding_system_internal
, 1, 2, 0,
9759 doc
: /* Internal use only. */)
9760 (Lisp_Object coding_system
, Lisp_Object terminal
)
9762 struct terminal
*term
= get_terminal (terminal
, 1);
9763 struct coding_system
*terminal_coding
= TERMINAL_TERMINAL_CODING (term
);
9764 CHECK_SYMBOL (coding_system
);
9765 setup_coding_system (Fcheck_coding_system (coding_system
), terminal_coding
);
9766 /* We had better not send unsafe characters to terminal. */
9767 terminal_coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
9768 /* Character composition should be disabled. */
9769 terminal_coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
9770 terminal_coding
->src_multibyte
= 1;
9771 terminal_coding
->dst_multibyte
= 0;
9773 (term
, (terminal_coding
->common_flags
& CODING_REQUIRE_ENCODING_MASK
9774 ? coding_charset_list (terminal_coding
)
9775 : list1 (make_number (charset_ascii
))));
9779 DEFUN ("set-safe-terminal-coding-system-internal",
9780 Fset_safe_terminal_coding_system_internal
,
9781 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
9782 doc
: /* Internal use only. */)
9783 (Lisp_Object coding_system
)
9785 CHECK_SYMBOL (coding_system
);
9786 setup_coding_system (Fcheck_coding_system (coding_system
),
9787 &safe_terminal_coding
);
9788 /* Character composition should be disabled. */
9789 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
9790 safe_terminal_coding
.src_multibyte
= 1;
9791 safe_terminal_coding
.dst_multibyte
= 0;
9795 DEFUN ("terminal-coding-system", Fterminal_coding_system
,
9796 Sterminal_coding_system
, 0, 1, 0,
9797 doc
: /* Return coding system specified for terminal output on the given terminal.
9798 TERMINAL may be a terminal object, a frame, or nil for the selected
9799 frame's terminal device. */)
9800 (Lisp_Object terminal
)
9802 struct coding_system
*terminal_coding
9803 = TERMINAL_TERMINAL_CODING (get_terminal (terminal
, 1));
9804 Lisp_Object coding_system
= CODING_ID_NAME (terminal_coding
->id
);
9806 /* For backward compatibility, return nil if it is `undecided'. */
9807 return (! EQ (coding_system
, Qundecided
) ? coding_system
: Qnil
);
9810 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal
,
9811 Sset_keyboard_coding_system_internal
, 1, 2, 0,
9812 doc
: /* Internal use only. */)
9813 (Lisp_Object coding_system
, Lisp_Object terminal
)
9815 struct terminal
*t
= get_terminal (terminal
, 1);
9816 CHECK_SYMBOL (coding_system
);
9817 if (NILP (coding_system
))
9818 coding_system
= Qno_conversion
;
9820 Fcheck_coding_system (coding_system
);
9821 setup_coding_system (coding_system
, TERMINAL_KEYBOARD_CODING (t
));
9822 /* Character composition should be disabled. */
9823 TERMINAL_KEYBOARD_CODING (t
)->common_flags
9824 &= ~CODING_ANNOTATE_COMPOSITION_MASK
;
9828 DEFUN ("keyboard-coding-system",
9829 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 1, 0,
9830 doc
: /* Return coding system specified for decoding keyboard input. */)
9831 (Lisp_Object terminal
)
9833 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9834 (get_terminal (terminal
, 1))->id
);
9838 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
9839 Sfind_operation_coding_system
, 1, MANY
, 0,
9840 doc
: /* Choose a coding system for an operation based on the target name.
9841 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9842 DECODING-SYSTEM is the coding system to use for decoding
9843 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9844 for encoding (in case OPERATION does encoding).
9846 The first argument OPERATION specifies an I/O primitive:
9847 For file I/O, `insert-file-contents' or `write-region'.
9848 For process I/O, `call-process', `call-process-region', or `start-process'.
9849 For network I/O, `open-network-stream'.
9851 The remaining arguments should be the same arguments that were passed
9852 to the primitive. Depending on which primitive, one of those arguments
9853 is selected as the TARGET. For example, if OPERATION does file I/O,
9854 whichever argument specifies the file name is TARGET.
9856 TARGET has a meaning which depends on OPERATION:
9857 For file I/O, TARGET is a file name (except for the special case below).
9858 For process I/O, TARGET is a process name.
9859 For network I/O, TARGET is a service name or a port number.
9861 This function looks up what is specified for TARGET in
9862 `file-coding-system-alist', `process-coding-system-alist',
9863 or `network-coding-system-alist' depending on OPERATION.
9864 They may specify a coding system, a cons of coding systems,
9865 or a function symbol to call.
9866 In the last case, we call the function with one argument,
9867 which is a list of all the arguments given to this function.
9868 If the function can't decide a coding system, it can return
9869 `undecided' so that the normal code-detection is performed.
9871 If OPERATION is `insert-file-contents', the argument corresponding to
9872 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9873 file name to look up, and BUFFER is a buffer that contains the file's
9874 contents (not yet decoded). If `file-coding-system-alist' specifies a
9875 function to call for FILENAME, that function should examine the
9876 contents of BUFFER instead of reading the file.
9878 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9879 (ptrdiff_t nargs
, Lisp_Object
*args
)
9881 Lisp_Object operation
, target_idx
, target
, val
;
9882 register Lisp_Object chain
;
9885 error ("Too few arguments");
9886 operation
= args
[0];
9887 if (!SYMBOLP (operation
)
9888 || (target_idx
= Fget (operation
, Qtarget_idx
), !NATNUMP (target_idx
)))
9889 error ("Invalid first argument");
9890 if (nargs
<= 1 + XFASTINT (target_idx
))
9891 error ("Too few arguments for operation `%s'",
9892 SDATA (SYMBOL_NAME (operation
)));
9893 target
= args
[XFASTINT (target_idx
) + 1];
9894 if (!(STRINGP (target
)
9895 || (EQ (operation
, Qinsert_file_contents
) && CONSP (target
)
9896 && STRINGP (XCAR (target
)) && BUFFERP (XCDR (target
)))
9897 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
9898 error ("Invalid argument %"pI
"d of operation `%s'",
9899 XFASTINT (target_idx
) + 1, SDATA (SYMBOL_NAME (operation
)));
9901 target
= XCAR (target
);
9903 chain
= ((EQ (operation
, Qinsert_file_contents
)
9904 || EQ (operation
, Qwrite_region
))
9905 ? Vfile_coding_system_alist
9906 : (EQ (operation
, Qopen_network_stream
)
9907 ? Vnetwork_coding_system_alist
9908 : Vprocess_coding_system_alist
));
9912 for (; CONSP (chain
); chain
= XCDR (chain
))
9918 && ((STRINGP (target
)
9919 && STRINGP (XCAR (elt
))
9920 && fast_string_match (XCAR (elt
), target
) >= 0)
9921 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
9924 /* Here, if VAL is both a valid coding system and a valid
9925 function symbol, we return VAL as a coding system. */
9928 if (! SYMBOLP (val
))
9930 if (! NILP (Fcoding_system_p (val
)))
9931 return Fcons (val
, val
);
9932 if (! NILP (Ffboundp (val
)))
9934 /* We use call1 rather than safe_call1
9935 so as to get bug reports about functions called here
9936 which don't handle the current interface. */
9937 val
= call1 (val
, Flist (nargs
, args
));
9940 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
9941 return Fcons (val
, val
);
9949 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
9950 Sset_coding_system_priority
, 0, MANY
, 0,
9951 doc
: /* Assign higher priority to the coding systems given as arguments.
9952 If multiple coding systems belong to the same category,
9953 all but the first one are ignored.
9955 usage: (set-coding-system-priority &rest coding-systems) */)
9956 (ptrdiff_t nargs
, Lisp_Object
*args
)
9959 bool changed
[coding_category_max
];
9960 enum coding_category priorities
[coding_category_max
];
9962 memset (changed
, 0, sizeof changed
);
9964 for (i
= j
= 0; i
< nargs
; i
++)
9966 enum coding_category category
;
9967 Lisp_Object spec
, attrs
;
9969 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
9970 attrs
= AREF (spec
, 0);
9971 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
9972 if (changed
[category
])
9973 /* Ignore this coding system because a coding system of the
9974 same category already had a higher priority. */
9976 changed
[category
] = 1;
9977 priorities
[j
++] = category
;
9978 if (coding_categories
[category
].id
>= 0
9979 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
9980 setup_coding_system (args
[i
], &coding_categories
[category
]);
9981 Fset (AREF (Vcoding_category_table
, category
), args
[i
]);
9984 /* Now we have decided top J priorities. Reflect the order of the
9985 original priorities to the remaining priorities. */
9987 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
9989 while (j
< coding_category_max
9990 && changed
[coding_priorities
[j
]])
9992 if (j
== coding_category_max
)
9994 priorities
[i
] = coding_priorities
[j
];
9997 memcpy (coding_priorities
, priorities
, sizeof priorities
);
9999 /* Update `coding-category-list'. */
10000 Vcoding_category_list
= Qnil
;
10001 for (i
= coding_category_max
; i
-- > 0; )
10002 Vcoding_category_list
10003 = Fcons (AREF (Vcoding_category_table
, priorities
[i
]),
10004 Vcoding_category_list
);
10009 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
10010 Scoding_system_priority_list
, 0, 1, 0,
10011 doc
: /* Return a list of coding systems ordered by their priorities.
10012 The list contains a subset of coding systems; i.e. coding systems
10013 assigned to each coding category (see `coding-category-list').
10015 HIGHESTP non-nil means just return the highest priority one. */)
10016 (Lisp_Object highestp
)
10021 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
10023 enum coding_category category
= coding_priorities
[i
];
10024 int id
= coding_categories
[category
].id
;
10029 attrs
= CODING_ID_ATTRS (id
);
10030 if (! NILP (highestp
))
10031 return CODING_ATTR_BASE_NAME (attrs
);
10032 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
10034 return Fnreverse (val
);
10037 static const char *const suffixes
[] = { "-unix", "-dos", "-mac" };
10040 make_subsidiaries (Lisp_Object base
)
10042 Lisp_Object subsidiaries
;
10043 ptrdiff_t base_name_len
= SBYTES (SYMBOL_NAME (base
));
10044 char *buf
= alloca (base_name_len
+ 6);
10047 memcpy (buf
, SDATA (SYMBOL_NAME (base
)), base_name_len
);
10048 subsidiaries
= make_uninit_vector (3);
10049 for (i
= 0; i
< 3; i
++)
10051 strcpy (buf
+ base_name_len
, suffixes
[i
]);
10052 ASET (subsidiaries
, i
, intern (buf
));
10054 return subsidiaries
;
10058 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
10059 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
10060 doc
: /* For internal use only.
10061 usage: (define-coding-system-internal ...) */)
10062 (ptrdiff_t nargs
, Lisp_Object
*args
)
10065 Lisp_Object spec_vec
; /* [ ATTRS ALIASE EOL_TYPE ] */
10066 Lisp_Object attrs
; /* Vector of attributes. */
10067 Lisp_Object eol_type
;
10068 Lisp_Object aliases
;
10069 Lisp_Object coding_type
, charset_list
, safe_charsets
;
10070 enum coding_category category
;
10071 Lisp_Object tail
, val
;
10072 int max_charset_id
= 0;
10075 if (nargs
< coding_arg_max
)
10078 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
10080 name
= args
[coding_arg_name
];
10081 CHECK_SYMBOL (name
);
10082 ASET (attrs
, coding_attr_base_name
, name
);
10084 val
= args
[coding_arg_mnemonic
];
10085 if (! STRINGP (val
))
10086 CHECK_CHARACTER (val
);
10087 ASET (attrs
, coding_attr_mnemonic
, val
);
10089 coding_type
= args
[coding_arg_coding_type
];
10090 CHECK_SYMBOL (coding_type
);
10091 ASET (attrs
, coding_attr_type
, coding_type
);
10093 charset_list
= args
[coding_arg_charset_list
];
10094 if (SYMBOLP (charset_list
))
10096 if (EQ (charset_list
, Qiso_2022
))
10098 if (! EQ (coding_type
, Qiso_2022
))
10099 error ("Invalid charset-list");
10100 charset_list
= Viso_2022_charset_list
;
10102 else if (EQ (charset_list
, Qemacs_mule
))
10104 if (! EQ (coding_type
, Qemacs_mule
))
10105 error ("Invalid charset-list");
10106 charset_list
= Vemacs_mule_charset_list
;
10108 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
10110 if (! RANGED_INTEGERP (0, XCAR (tail
), INT_MAX
- 1))
10111 error ("Invalid charset-list");
10112 if (max_charset_id
< XFASTINT (XCAR (tail
)))
10113 max_charset_id
= XFASTINT (XCAR (tail
));
10118 charset_list
= Fcopy_sequence (charset_list
);
10119 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
10121 struct charset
*charset
;
10124 CHECK_CHARSET_GET_CHARSET (val
, charset
);
10125 if (EQ (coding_type
, Qiso_2022
)
10126 ? CHARSET_ISO_FINAL (charset
) < 0
10127 : EQ (coding_type
, Qemacs_mule
)
10128 ? CHARSET_EMACS_MULE_ID (charset
) < 0
10130 error ("Can't handle charset `%s'",
10131 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
10133 XSETCAR (tail
, make_number (charset
->id
));
10134 if (max_charset_id
< charset
->id
)
10135 max_charset_id
= charset
->id
;
10138 ASET (attrs
, coding_attr_charset_list
, charset_list
);
10140 safe_charsets
= make_uninit_string (max_charset_id
+ 1);
10141 memset (SDATA (safe_charsets
), 255, max_charset_id
+ 1);
10142 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
10143 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
10144 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
10146 ASET (attrs
, coding_attr_ascii_compat
, args
[coding_arg_ascii_compatible_p
]);
10148 val
= args
[coding_arg_decode_translation_table
];
10149 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
10150 CHECK_SYMBOL (val
);
10151 ASET (attrs
, coding_attr_decode_tbl
, val
);
10153 val
= args
[coding_arg_encode_translation_table
];
10154 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
10155 CHECK_SYMBOL (val
);
10156 ASET (attrs
, coding_attr_encode_tbl
, val
);
10158 val
= args
[coding_arg_post_read_conversion
];
10159 CHECK_SYMBOL (val
);
10160 ASET (attrs
, coding_attr_post_read
, val
);
10162 val
= args
[coding_arg_pre_write_conversion
];
10163 CHECK_SYMBOL (val
);
10164 ASET (attrs
, coding_attr_pre_write
, val
);
10166 val
= args
[coding_arg_default_char
];
10168 ASET (attrs
, coding_attr_default_char
, make_number (' '));
10171 CHECK_CHARACTER (val
);
10172 ASET (attrs
, coding_attr_default_char
, val
);
10175 val
= args
[coding_arg_for_unibyte
];
10176 ASET (attrs
, coding_attr_for_unibyte
, NILP (val
) ? Qnil
: Qt
);
10178 val
= args
[coding_arg_plist
];
10180 ASET (attrs
, coding_attr_plist
, val
);
10182 if (EQ (coding_type
, Qcharset
))
10184 /* Generate a lisp vector of 256 elements. Each element is nil,
10185 integer, or a list of charset IDs.
10187 If Nth element is nil, the byte code N is invalid in this
10190 If Nth element is a number NUM, N is the first byte of a
10191 charset whose ID is NUM.
10193 If Nth element is a list of charset IDs, N is the first byte
10194 of one of them. The list is sorted by dimensions of the
10195 charsets. A charset of smaller dimension comes first. */
10196 val
= Fmake_vector (make_number (256), Qnil
);
10198 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
10200 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
10201 int dim
= CHARSET_DIMENSION (charset
);
10202 int idx
= (dim
- 1) * 4;
10204 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
10205 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
10207 for (i
= charset
->code_space
[idx
];
10208 i
<= charset
->code_space
[idx
+ 1]; i
++)
10210 Lisp_Object tmp
, tmp2
;
10213 tmp
= AREF (val
, i
);
10216 else if (NUMBERP (tmp
))
10218 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
10220 tmp
= list2 (XCAR (tail
), tmp
);
10222 tmp
= list2 (tmp
, XCAR (tail
));
10226 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
10228 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
10233 tmp
= nconc2 (tmp
, list1 (XCAR (tail
)));
10236 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
10237 XSETCAR (tmp2
, XCAR (tail
));
10240 ASET (val
, i
, tmp
);
10243 ASET (attrs
, coding_attr_charset_valids
, val
);
10244 category
= coding_category_charset
;
10246 else if (EQ (coding_type
, Qccl
))
10248 Lisp_Object valids
;
10250 if (nargs
< coding_arg_ccl_max
)
10253 val
= args
[coding_arg_ccl_decoder
];
10254 CHECK_CCL_PROGRAM (val
);
10256 val
= Fcopy_sequence (val
);
10257 ASET (attrs
, coding_attr_ccl_decoder
, val
);
10259 val
= args
[coding_arg_ccl_encoder
];
10260 CHECK_CCL_PROGRAM (val
);
10262 val
= Fcopy_sequence (val
);
10263 ASET (attrs
, coding_attr_ccl_encoder
, val
);
10265 val
= args
[coding_arg_ccl_valids
];
10266 valids
= Fmake_string (make_number (256), make_number (0));
10267 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
10272 if (INTEGERP (val
))
10274 if (! (0 <= XINT (val
) && XINT (val
) <= 255))
10275 args_out_of_range_3 (val
, make_number (0), make_number (255));
10276 from
= to
= XINT (val
);
10281 CHECK_NATNUM_CAR (val
);
10282 CHECK_NUMBER_CDR (val
);
10283 if (XINT (XCAR (val
)) > 255)
10284 args_out_of_range_3 (XCAR (val
),
10285 make_number (0), make_number (255));
10286 from
= XINT (XCAR (val
));
10287 if (! (from
<= XINT (XCDR (val
)) && XINT (XCDR (val
)) <= 255))
10288 args_out_of_range_3 (XCDR (val
),
10289 XCAR (val
), make_number (255));
10290 to
= XINT (XCDR (val
));
10292 for (i
= from
; i
<= to
; i
++)
10293 SSET (valids
, i
, 1);
10295 ASET (attrs
, coding_attr_ccl_valids
, valids
);
10297 category
= coding_category_ccl
;
10299 else if (EQ (coding_type
, Qutf_16
))
10301 Lisp_Object bom
, endian
;
10303 ASET (attrs
, coding_attr_ascii_compat
, Qnil
);
10305 if (nargs
< coding_arg_utf16_max
)
10308 bom
= args
[coding_arg_utf16_bom
];
10309 if (! NILP (bom
) && ! EQ (bom
, Qt
))
10313 CHECK_CODING_SYSTEM (val
);
10315 CHECK_CODING_SYSTEM (val
);
10317 ASET (attrs
, coding_attr_utf_bom
, bom
);
10319 endian
= args
[coding_arg_utf16_endian
];
10320 CHECK_SYMBOL (endian
);
10323 else if (! EQ (endian
, Qbig
) && ! EQ (endian
, Qlittle
))
10324 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian
)));
10325 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
10327 category
= (CONSP (bom
)
10328 ? coding_category_utf_16_auto
10330 ? (EQ (endian
, Qbig
)
10331 ? coding_category_utf_16_be_nosig
10332 : coding_category_utf_16_le_nosig
)
10333 : (EQ (endian
, Qbig
)
10334 ? coding_category_utf_16_be
10335 : coding_category_utf_16_le
));
10337 else if (EQ (coding_type
, Qiso_2022
))
10339 Lisp_Object initial
, reg_usage
, request
, flags
;
10341 if (nargs
< coding_arg_iso2022_max
)
10344 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
10345 CHECK_VECTOR (initial
);
10346 for (i
= 0; i
< 4; i
++)
10348 val
= AREF (initial
, i
);
10351 struct charset
*charset
;
10353 CHECK_CHARSET_GET_CHARSET (val
, charset
);
10354 ASET (initial
, i
, make_number (CHARSET_ID (charset
)));
10355 if (i
== 0 && CHARSET_ASCII_COMPATIBLE_P (charset
))
10356 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
10359 ASET (initial
, i
, make_number (-1));
10362 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
10363 CHECK_CONS (reg_usage
);
10364 CHECK_NUMBER_CAR (reg_usage
);
10365 CHECK_NUMBER_CDR (reg_usage
);
10367 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
10368 for (tail
= request
; CONSP (tail
); tail
= XCDR (tail
))
10376 CHECK_CHARSET_GET_ID (tmp1
, id
);
10377 CHECK_NATNUM_CDR (val
);
10378 if (XINT (XCDR (val
)) >= 4)
10379 error ("Invalid graphic register number: %"pI
"d", XINT (XCDR (val
)));
10380 XSETCAR (val
, make_number (id
));
10383 flags
= args
[coding_arg_iso2022_flags
];
10384 CHECK_NATNUM (flags
);
10385 i
= XINT (flags
) & INT_MAX
;
10386 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
10387 i
|= CODING_ISO_FLAG_FULL_SUPPORT
;
10388 flags
= make_number (i
);
10390 ASET (attrs
, coding_attr_iso_initial
, initial
);
10391 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
10392 ASET (attrs
, coding_attr_iso_request
, request
);
10393 ASET (attrs
, coding_attr_iso_flags
, flags
);
10394 setup_iso_safe_charsets (attrs
);
10396 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
10397 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
10398 | CODING_ISO_FLAG_SINGLE_SHIFT
))
10399 ? coding_category_iso_7_else
10400 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
10401 ? coding_category_iso_7
10402 : coding_category_iso_7_tight
);
10405 int id
= XINT (AREF (initial
, 1));
10407 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
10408 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
10410 ? coding_category_iso_8_else
10411 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
10412 ? coding_category_iso_8_1
10413 : coding_category_iso_8_2
);
10415 if (category
!= coding_category_iso_8_1
10416 && category
!= coding_category_iso_8_2
)
10417 ASET (attrs
, coding_attr_ascii_compat
, Qnil
);
10419 else if (EQ (coding_type
, Qemacs_mule
))
10421 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
10422 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
10423 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
10424 category
= coding_category_emacs_mule
;
10426 else if (EQ (coding_type
, Qshift_jis
))
10429 struct charset
*charset
;
10431 if (XINT (Flength (charset_list
)) != 3
10432 && XINT (Flength (charset_list
)) != 4)
10433 error ("There should be three or four charsets");
10435 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
10436 if (CHARSET_DIMENSION (charset
) != 1)
10437 error ("Dimension of charset %s is not one",
10438 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
10439 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
10440 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
10442 charset_list
= XCDR (charset_list
);
10443 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
10444 if (CHARSET_DIMENSION (charset
) != 1)
10445 error ("Dimension of charset %s is not one",
10446 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
10448 charset_list
= XCDR (charset_list
);
10449 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
10450 if (CHARSET_DIMENSION (charset
) != 2)
10451 error ("Dimension of charset %s is not two",
10452 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
10454 charset_list
= XCDR (charset_list
);
10455 if (! NILP (charset_list
))
10457 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
10458 if (CHARSET_DIMENSION (charset
) != 2)
10459 error ("Dimension of charset %s is not two",
10460 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
10463 category
= coding_category_sjis
;
10464 Vsjis_coding_system
= name
;
10466 else if (EQ (coding_type
, Qbig5
))
10468 struct charset
*charset
;
10470 if (XINT (Flength (charset_list
)) != 2)
10471 error ("There should be just two charsets");
10473 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
10474 if (CHARSET_DIMENSION (charset
) != 1)
10475 error ("Dimension of charset %s is not one",
10476 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
10477 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
10478 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
10480 charset_list
= XCDR (charset_list
);
10481 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
10482 if (CHARSET_DIMENSION (charset
) != 2)
10483 error ("Dimension of charset %s is not two",
10484 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
10486 category
= coding_category_big5
;
10487 Vbig5_coding_system
= name
;
10489 else if (EQ (coding_type
, Qraw_text
))
10491 category
= coding_category_raw_text
;
10492 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
10494 else if (EQ (coding_type
, Qutf_8
))
10498 if (nargs
< coding_arg_utf8_max
)
10501 bom
= args
[coding_arg_utf8_bom
];
10502 if (! NILP (bom
) && ! EQ (bom
, Qt
))
10506 CHECK_CODING_SYSTEM (val
);
10508 CHECK_CODING_SYSTEM (val
);
10510 ASET (attrs
, coding_attr_utf_bom
, bom
);
10512 ASET (attrs
, coding_attr_ascii_compat
, Qt
);
10514 category
= (CONSP (bom
) ? coding_category_utf_8_auto
10515 : NILP (bom
) ? coding_category_utf_8_nosig
10516 : coding_category_utf_8_sig
);
10518 else if (EQ (coding_type
, Qundecided
))
10520 if (nargs
< coding_arg_undecided_max
)
10522 ASET (attrs
, coding_attr_undecided_inhibit_null_byte_detection
,
10523 args
[coding_arg_undecided_inhibit_null_byte_detection
]);
10524 ASET (attrs
, coding_attr_undecided_inhibit_iso_escape_detection
,
10525 args
[coding_arg_undecided_inhibit_iso_escape_detection
]);
10526 ASET (attrs
, coding_attr_undecided_prefer_utf_8
,
10527 args
[coding_arg_undecided_prefer_utf_8
]);
10528 category
= coding_category_undecided
;
10531 error ("Invalid coding system type: %s",
10532 SDATA (SYMBOL_NAME (coding_type
)));
10534 ASET (attrs
, coding_attr_category
, make_number (category
));
10535 ASET (attrs
, coding_attr_plist
,
10537 Fcons (AREF (Vcoding_category_table
, category
),
10538 CODING_ATTR_PLIST (attrs
))));
10539 ASET (attrs
, coding_attr_plist
,
10540 Fcons (QCascii_compatible_p
,
10541 Fcons (CODING_ATTR_ASCII_COMPAT (attrs
),
10542 CODING_ATTR_PLIST (attrs
))));
10544 eol_type
= args
[coding_arg_eol_type
];
10545 if (! NILP (eol_type
)
10546 && ! EQ (eol_type
, Qunix
)
10547 && ! EQ (eol_type
, Qdos
)
10548 && ! EQ (eol_type
, Qmac
))
10549 error ("Invalid eol-type");
10551 aliases
= list1 (name
);
10553 if (NILP (eol_type
))
10555 eol_type
= make_subsidiaries (name
);
10556 for (i
= 0; i
< 3; i
++)
10558 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
10560 this_name
= AREF (eol_type
, i
);
10561 this_aliases
= list1 (this_name
);
10562 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
10563 this_spec
= make_uninit_vector (3);
10564 ASET (this_spec
, 0, attrs
);
10565 ASET (this_spec
, 1, this_aliases
);
10566 ASET (this_spec
, 2, this_eol_type
);
10567 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
10568 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
10569 val
= Fassoc (Fsymbol_name (this_name
), Vcoding_system_alist
);
10571 Vcoding_system_alist
10572 = Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
10573 Vcoding_system_alist
);
10577 spec_vec
= make_uninit_vector (3);
10578 ASET (spec_vec
, 0, attrs
);
10579 ASET (spec_vec
, 1, aliases
);
10580 ASET (spec_vec
, 2, eol_type
);
10582 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
10583 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
10584 val
= Fassoc (Fsymbol_name (name
), Vcoding_system_alist
);
10586 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
10587 Vcoding_system_alist
);
10590 int id
= coding_categories
[category
].id
;
10592 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
10593 setup_coding_system (name
, &coding_categories
[category
]);
10599 return Fsignal (Qwrong_number_of_arguments
,
10600 Fcons (intern ("define-coding-system-internal"),
10601 make_number (nargs
)));
10605 DEFUN ("coding-system-put", Fcoding_system_put
, Scoding_system_put
,
10607 doc
: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10608 (Lisp_Object coding_system
, Lisp_Object prop
, Lisp_Object val
)
10610 Lisp_Object spec
, attrs
;
10612 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10613 attrs
= AREF (spec
, 0);
10614 if (EQ (prop
, QCmnemonic
))
10616 if (! STRINGP (val
))
10617 CHECK_CHARACTER (val
);
10618 ASET (attrs
, coding_attr_mnemonic
, val
);
10620 else if (EQ (prop
, QCdefault_char
))
10623 val
= make_number (' ');
10625 CHECK_CHARACTER (val
);
10626 ASET (attrs
, coding_attr_default_char
, val
);
10628 else if (EQ (prop
, QCdecode_translation_table
))
10630 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
10631 CHECK_SYMBOL (val
);
10632 ASET (attrs
, coding_attr_decode_tbl
, val
);
10634 else if (EQ (prop
, QCencode_translation_table
))
10636 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
10637 CHECK_SYMBOL (val
);
10638 ASET (attrs
, coding_attr_encode_tbl
, val
);
10640 else if (EQ (prop
, QCpost_read_conversion
))
10642 CHECK_SYMBOL (val
);
10643 ASET (attrs
, coding_attr_post_read
, val
);
10645 else if (EQ (prop
, QCpre_write_conversion
))
10647 CHECK_SYMBOL (val
);
10648 ASET (attrs
, coding_attr_pre_write
, val
);
10650 else if (EQ (prop
, QCascii_compatible_p
))
10652 ASET (attrs
, coding_attr_ascii_compat
, val
);
10655 ASET (attrs
, coding_attr_plist
,
10656 Fplist_put (CODING_ATTR_PLIST (attrs
), prop
, val
));
10661 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
10662 Sdefine_coding_system_alias
, 2, 2, 0,
10663 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10664 (Lisp_Object alias
, Lisp_Object coding_system
)
10666 Lisp_Object spec
, aliases
, eol_type
, val
;
10668 CHECK_SYMBOL (alias
);
10669 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10670 aliases
= AREF (spec
, 1);
10671 /* ALIASES should be a list of length more than zero, and the first
10672 element is a base coding system. Append ALIAS at the tail of the
10674 while (!NILP (XCDR (aliases
)))
10675 aliases
= XCDR (aliases
);
10676 XSETCDR (aliases
, list1 (alias
));
10678 eol_type
= AREF (spec
, 2);
10679 if (VECTORP (eol_type
))
10681 Lisp_Object subsidiaries
;
10684 subsidiaries
= make_subsidiaries (alias
);
10685 for (i
= 0; i
< 3; i
++)
10686 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
10687 AREF (eol_type
, i
));
10690 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
10691 Vcoding_system_list
= Fcons (alias
, Vcoding_system_list
);
10692 val
= Fassoc (Fsymbol_name (alias
), Vcoding_system_alist
);
10694 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
10695 Vcoding_system_alist
);
10700 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
10702 doc
: /* Return the base of CODING-SYSTEM.
10703 Any alias or subsidiary coding system is not a base coding system. */)
10704 (Lisp_Object coding_system
)
10706 Lisp_Object spec
, attrs
;
10708 if (NILP (coding_system
))
10709 return (Qno_conversion
);
10710 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10711 attrs
= AREF (spec
, 0);
10712 return CODING_ATTR_BASE_NAME (attrs
);
10715 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
10717 doc
: "Return the property list of CODING-SYSTEM.")
10718 (Lisp_Object coding_system
)
10720 Lisp_Object spec
, attrs
;
10722 if (NILP (coding_system
))
10723 coding_system
= Qno_conversion
;
10724 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10725 attrs
= AREF (spec
, 0);
10726 return CODING_ATTR_PLIST (attrs
);
10730 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
10732 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
10733 (Lisp_Object coding_system
)
10737 if (NILP (coding_system
))
10738 coding_system
= Qno_conversion
;
10739 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10740 return AREF (spec
, 1);
10743 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
10744 Scoding_system_eol_type
, 1, 1, 0,
10745 doc
: /* Return eol-type of CODING-SYSTEM.
10746 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10748 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10749 and CR respectively.
10751 A vector value indicates that a format of end-of-line should be
10752 detected automatically. Nth element of the vector is the subsidiary
10753 coding system whose eol-type is N. */)
10754 (Lisp_Object coding_system
)
10756 Lisp_Object spec
, eol_type
;
10759 if (NILP (coding_system
))
10760 coding_system
= Qno_conversion
;
10761 if (! CODING_SYSTEM_P (coding_system
))
10763 spec
= CODING_SYSTEM_SPEC (coding_system
);
10764 eol_type
= AREF (spec
, 2);
10765 if (VECTORP (eol_type
))
10766 return Fcopy_sequence (eol_type
);
10767 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
10768 return make_number (n
);
/*** 9. Post-amble ***/
10777 init_coding_once (void)
10781 for (i
= 0; i
< coding_category_max
; i
++)
10783 coding_categories
[i
].id
= -1;
10784 coding_priorities
[i
] = i
;
10787 /* ISO2022 specific initialize routine. */
10788 for (i
= 0; i
< 0x20; i
++)
10789 iso_code_class
[i
] = ISO_control_0
;
10790 for (i
= 0x21; i
< 0x7F; i
++)
10791 iso_code_class
[i
] = ISO_graphic_plane_0
;
10792 for (i
= 0x80; i
< 0xA0; i
++)
10793 iso_code_class
[i
] = ISO_control_1
;
10794 for (i
= 0xA1; i
< 0xFF; i
++)
10795 iso_code_class
[i
] = ISO_graphic_plane_1
;
10796 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
10797 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
10798 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
10799 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
10800 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
10801 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
10802 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
10803 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
10804 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
10806 for (i
= 0; i
< 256; i
++)
10808 emacs_mule_bytes
[i
] = 1;
10810 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
10811 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
10812 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
10813 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
10819 syms_of_coding (void)
10821 staticpro (&Vcoding_system_hash_table
);
10823 Lisp_Object args
[2];
10826 Vcoding_system_hash_table
= Fmake_hash_table (2, args
);
10829 staticpro (&Vsjis_coding_system
);
10830 Vsjis_coding_system
= Qnil
;
10832 staticpro (&Vbig5_coding_system
);
10833 Vbig5_coding_system
= Qnil
;
10835 staticpro (&Vcode_conversion_reused_workbuf
);
10836 Vcode_conversion_reused_workbuf
= Qnil
;
10838 staticpro (&Vcode_conversion_workbuf_name
);
10839 Vcode_conversion_workbuf_name
= build_pure_c_string (" *code-conversion-work*");
10841 reused_workbuf_in_use
= 0;
10843 DEFSYM (Qcharset
, "charset");
10844 DEFSYM (Qtarget_idx
, "target-idx");
10845 DEFSYM (Qcoding_system_history
, "coding-system-history");
10846 Fset (Qcoding_system_history
, Qnil
);
10848 /* Target FILENAME is the first argument. */
10849 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
10850 /* Target FILENAME is the third argument. */
10851 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
10853 DEFSYM (Qcall_process
, "call-process");
10854 /* Target PROGRAM is the first argument. */
10855 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
10857 DEFSYM (Qcall_process_region
, "call-process-region");
10858 /* Target PROGRAM is the third argument. */
10859 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
10861 DEFSYM (Qstart_process
, "start-process");
10862 /* Target PROGRAM is the third argument. */
10863 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
10865 DEFSYM (Qopen_network_stream
, "open-network-stream");
10866 /* Target SERVICE is the fourth argument. */
10867 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
10869 DEFSYM (Qcoding_system
, "coding-system");
10870 DEFSYM (Qcoding_aliases
, "coding-aliases");
10872 DEFSYM (Qeol_type
, "eol-type");
10873 DEFSYM (Qunix
, "unix");
10874 DEFSYM (Qdos
, "dos");
10875 DEFSYM (Qmac
, "mac");
10877 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
10878 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
10879 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
10880 DEFSYM (Qdefault_char
, "default-char");
10881 DEFSYM (Qundecided
, "undecided");
10882 DEFSYM (Qno_conversion
, "no-conversion");
10883 DEFSYM (Qraw_text
, "raw-text");
10885 DEFSYM (Qiso_2022
, "iso-2022");
10887 DEFSYM (Qutf_8
, "utf-8");
10888 DEFSYM (Qutf_8_emacs
, "utf-8-emacs");
10890 #if defined (WINDOWSNT) || defined (CYGWIN)
10891 /* No, not utf-16-le: that one has a BOM. */
10892 DEFSYM (Qutf_16le
, "utf-16le");
10895 DEFSYM (Qutf_16
, "utf-16");
10896 DEFSYM (Qbig
, "big");
10897 DEFSYM (Qlittle
, "little");
10899 DEFSYM (Qshift_jis
, "shift-jis");
10900 DEFSYM (Qbig5
, "big5");
10902 DEFSYM (Qcoding_system_p
, "coding-system-p");
10904 DEFSYM (Qcoding_system_error
, "coding-system-error");
10905 Fput (Qcoding_system_error
, Qerror_conditions
,
10906 listn (CONSTYPE_PURE
, 2, Qcoding_system_error
, Qerror
));
10907 Fput (Qcoding_system_error
, Qerror_message
,
10908 build_pure_c_string ("Invalid coding system"));
10910 DEFSYM (Qtranslation_table
, "translation-table");
10911 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (2));
10912 DEFSYM (Qtranslation_table_id
, "translation-table-id");
10913 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
10914 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
10916 DEFSYM (Qvalid_codes
, "valid-codes");
10918 DEFSYM (Qemacs_mule
, "emacs-mule");
10920 DEFSYM (QCcategory
, ":category");
10921 DEFSYM (QCmnemonic
, ":mnemonic");
10922 DEFSYM (QCdefault_char
, ":default-char");
10923 DEFSYM (QCdecode_translation_table
, ":decode-translation-table");
10924 DEFSYM (QCencode_translation_table
, ":encode-translation-table");
10925 DEFSYM (QCpost_read_conversion
, ":post-read-conversion");
10926 DEFSYM (QCpre_write_conversion
, ":pre-write-conversion");
10927 DEFSYM (QCascii_compatible_p
, ":ascii-compatible-p");
10929 Vcoding_category_table
10930 = Fmake_vector (make_number (coding_category_max
), Qnil
);
10931 staticpro (&Vcoding_category_table
);
10932 /* The following are targets of code detection. */
10933 ASET (Vcoding_category_table
, coding_category_iso_7
,
10934 intern_c_string ("coding-category-iso-7"));
10935 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
10936 intern_c_string ("coding-category-iso-7-tight"));
10937 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
10938 intern_c_string ("coding-category-iso-8-1"));
10939 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
10940 intern_c_string ("coding-category-iso-8-2"));
10941 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
10942 intern_c_string ("coding-category-iso-7-else"));
10943 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
10944 intern_c_string ("coding-category-iso-8-else"));
10945 ASET (Vcoding_category_table
, coding_category_utf_8_auto
,
10946 intern_c_string ("coding-category-utf-8-auto"));
10947 ASET (Vcoding_category_table
, coding_category_utf_8_nosig
,
10948 intern_c_string ("coding-category-utf-8"));
10949 ASET (Vcoding_category_table
, coding_category_utf_8_sig
,
10950 intern_c_string ("coding-category-utf-8-sig"));
10951 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
10952 intern_c_string ("coding-category-utf-16-be"));
10953 ASET (Vcoding_category_table
, coding_category_utf_16_auto
,
10954 intern_c_string ("coding-category-utf-16-auto"));
10955 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
10956 intern_c_string ("coding-category-utf-16-le"));
10957 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
10958 intern_c_string ("coding-category-utf-16-be-nosig"));
10959 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
10960 intern_c_string ("coding-category-utf-16-le-nosig"));
10961 ASET (Vcoding_category_table
, coding_category_charset
,
10962 intern_c_string ("coding-category-charset"));
10963 ASET (Vcoding_category_table
, coding_category_sjis
,
10964 intern_c_string ("coding-category-sjis"));
10965 ASET (Vcoding_category_table
, coding_category_big5
,
10966 intern_c_string ("coding-category-big5"));
10967 ASET (Vcoding_category_table
, coding_category_ccl
,
10968 intern_c_string ("coding-category-ccl"));
10969 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
10970 intern_c_string ("coding-category-emacs-mule"));
10971 /* The following are NOT targets of code detection. */
10972 ASET (Vcoding_category_table
, coding_category_raw_text
,
10973 intern_c_string ("coding-category-raw-text"));
10974 ASET (Vcoding_category_table
, coding_category_undecided
,
10975 intern_c_string ("coding-category-undecided"));
10977 DEFSYM (Qinsufficient_source
, "insufficient-source");
10978 DEFSYM (Qinvalid_source
, "invalid-source");
10979 DEFSYM (Qinterrupted
, "interrupted");
10980 DEFSYM (Qcoding_system_define_form
, "coding-system-define-form");
10982 defsubr (&Scoding_system_p
);
10983 defsubr (&Sread_coding_system
);
10984 defsubr (&Sread_non_nil_coding_system
);
10985 defsubr (&Scheck_coding_system
);
10986 defsubr (&Sdetect_coding_region
);
10987 defsubr (&Sdetect_coding_string
);
10988 defsubr (&Sfind_coding_systems_region_internal
);
10989 defsubr (&Sunencodable_char_position
);
10990 defsubr (&Scheck_coding_systems_region
);
10991 defsubr (&Sdecode_coding_region
);
10992 defsubr (&Sencode_coding_region
);
10993 defsubr (&Sdecode_coding_string
);
10994 defsubr (&Sencode_coding_string
);
10995 defsubr (&Sdecode_sjis_char
);
10996 defsubr (&Sencode_sjis_char
);
10997 defsubr (&Sdecode_big5_char
);
10998 defsubr (&Sencode_big5_char
);
10999 defsubr (&Sset_terminal_coding_system_internal
);
11000 defsubr (&Sset_safe_terminal_coding_system_internal
);
11001 defsubr (&Sterminal_coding_system
);
11002 defsubr (&Sset_keyboard_coding_system_internal
);
11003 defsubr (&Skeyboard_coding_system
);
11004 defsubr (&Sfind_operation_coding_system
);
11005 defsubr (&Sset_coding_system_priority
);
11006 defsubr (&Sdefine_coding_system_internal
);
11007 defsubr (&Sdefine_coding_system_alias
);
11008 defsubr (&Scoding_system_put
);
11009 defsubr (&Scoding_system_base
);
11010 defsubr (&Scoding_system_plist
);
11011 defsubr (&Scoding_system_aliases
);
11012 defsubr (&Scoding_system_eol_type
);
11013 defsubr (&Scoding_system_priority_list
);
11015 DEFVAR_LISP ("coding-system-list", Vcoding_system_list
,
11016 doc
: /* List of coding systems.
11018 Do not alter the value of this variable manually. This variable should be
11019 updated by the functions `define-coding-system' and
11020 `define-coding-system-alias'. */);
11021 Vcoding_system_list
= Qnil
;
11023 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist
,
11024 doc
: /* Alist of coding system names.
11025 Each element is a one-element list containing a coding system name.
11026 This variable is given to `completing-read' as COLLECTION argument.
11028 Do not alter the value of this variable manually. This variable should be
11029 updated by the functions `define-coding-system' and
11030 `define-coding-system-alias'. */);
11031 Vcoding_system_alist
= Qnil
;
11033 DEFVAR_LISP ("coding-category-list", Vcoding_category_list
,
11034 doc
: /* List of coding-categories (symbols) ordered by priority.
11036 On detecting a coding system, Emacs tries code detection algorithms
11037 associated with each coding-category one by one in this order. When
11038 one algorithm agrees with a byte sequence of source text, the coding
11039 system bound to the corresponding coding-category is selected.
11041 Don't modify this variable directly, but use `set-coding-system-priority'. */);
11045 Vcoding_category_list
= Qnil
;
11046 for (i
= coding_category_max
- 1; i
>= 0; i
--)
11047 Vcoding_category_list
11048 = Fcons (AREF (Vcoding_category_table
, i
),
11049 Vcoding_category_list
);
11052 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read
,
11053 doc
: /* Specify the coding system for read operations.
11054 It is useful to bind this variable with `let', but do not set it globally.
11055 If the value is a coding system, it is used for decoding on read operation.
11056 If not, an appropriate element is used from one of the coding system alists.
11057 There are three such tables: `file-coding-system-alist',
11058 `process-coding-system-alist', and `network-coding-system-alist'. */);
11059 Vcoding_system_for_read
= Qnil
;
11061 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write
,
11062 doc
: /* Specify the coding system for write operations.
11063 Programs bind this variable with `let', but you should not set it globally.
11064 If the value is a coding system, it is used for encoding of output,
11065 when writing it to a file and when sending it to a file or subprocess.
11067 If this does not specify a coding system, an appropriate element
11068 is used from one of the coding system alists.
11069 There are three such tables: `file-coding-system-alist',
11070 `process-coding-system-alist', and `network-coding-system-alist'.
11071 For output to files, if the above procedure does not specify a coding system,
11072 the value of `buffer-file-coding-system' is used. */);
11073 Vcoding_system_for_write
= Qnil
;
11075 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used
,
11077 Coding system used in the latest file or process I/O. */);
11078 Vlast_coding_system_used
= Qnil
;
11080 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error
,
11082 Error status of the last code conversion.
11084 When an error was detected in the last code conversion, this variable
11085 is set to one of the following symbols.
11086 `insufficient-source'
11090 `insufficient-memory'
11091 When no error was detected, the value doesn't change. So, to check
11092 the error status of a code conversion by this variable, you must
11093 explicitly set this variable to nil before performing code
11095 Vlast_code_conversion_error
= Qnil
;
11097 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion
,
11099 *Non-nil means always inhibit code conversion of end-of-line format.
11100 See info node `Coding Systems' and info node `Text and Binary' concerning
11101 such conversion. */);
11102 inhibit_eol_conversion
= 0;
11104 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system
,
11106 Non-nil means process buffer inherits coding system of process output.
11107 Bind it to t if the process output is to be treated as if it were a file
11108 read from some filesystem. */);
11109 inherit_process_coding_system
= 0;
11111 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist
,
11113 Alist to decide a coding system to use for a file I/O operation.
11114 The format is ((PATTERN . VAL) ...),
11115 where PATTERN is a regular expression matching a file name,
11116 VAL is a coding system, a cons of coding systems, or a function symbol.
11117 If VAL is a coding system, it is used for both decoding and encoding
11119 If VAL is a cons of coding systems, the car part is used for decoding,
11120 and the cdr part is used for encoding.
11121 If VAL is a function symbol, the function must return a coding system
11122 or a cons of coding systems which are used as above. The function is
11123 called with an argument that is a list of the arguments with which
11124 `find-operation-coding-system' was called. If the function can't decide
11125 a coding system, it can return `undecided' so that the normal
11126 code-detection is performed.
11128 See also the function `find-operation-coding-system'
11129 and the variable `auto-coding-alist'. */);
11130 Vfile_coding_system_alist
= Qnil
;
11132 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist
,
11134 Alist to decide a coding system to use for a process I/O operation.
11135 The format is ((PATTERN . VAL) ...),
11136 where PATTERN is a regular expression matching a program name,
11137 VAL is a coding system, a cons of coding systems, or a function symbol.
11138 If VAL is a coding system, it is used for both decoding what is received
11139 from the program and encoding what is sent to the program.
11140 If VAL is a cons of coding systems, the car part is used for decoding,
11141 and the cdr part is used for encoding.
11142 If VAL is a function symbol, the function must return a coding system
11143 or a cons of coding systems which are used as above.
11145 See also the function `find-operation-coding-system'. */);
11146 Vprocess_coding_system_alist
= Qnil
;
11148 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist
,
11150 Alist to decide a coding system to use for a network I/O operation.
11151 The format is ((PATTERN . VAL) ...),
11152 where PATTERN is a regular expression matching a network service name
11153 or is a port number to connect to,
11154 VAL is a coding system, a cons of coding systems, or a function symbol.
11155 If VAL is a coding system, it is used for both decoding what is received
11156 from the network stream and encoding what is sent to the network stream.
11157 If VAL is a cons of coding systems, the car part is used for decoding,
11158 and the cdr part is used for encoding.
11159 If VAL is a function symbol, the function must return a coding system
11160 or a cons of coding systems which are used as above.
11162 See also the function `find-operation-coding-system'. */);
11163 Vnetwork_coding_system_alist
= Qnil
;
11165 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system
,
11166 doc
: /* Coding system to use with system messages.
11167 Also used for decoding keyboard input on X Window system. */);
11168 Vlocale_coding_system
= Qnil
;
11170 /* The eol mnemonics are reset in startup.el system-dependently. */
11171 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix
,
11173 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
11174 eol_mnemonic_unix
= build_pure_c_string (":");
11176 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos
,
11178 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
11179 eol_mnemonic_dos
= build_pure_c_string ("\\");
11181 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac
,
11183 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
11184 eol_mnemonic_mac
= build_pure_c_string ("/");
11186 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided
,
11188 *String displayed in mode line when end-of-line format is not yet determined. */);
11189 eol_mnemonic_undecided
= build_pure_c_string (":");
11191 DEFVAR_LISP ("enable-character-translation", Venable_character_translation
,
11193 *Non-nil enables character translation while encoding and decoding. */);
11194 Venable_character_translation
= Qt
;
11196 DEFVAR_LISP ("standard-translation-table-for-decode",
11197 Vstandard_translation_table_for_decode
,
11198 doc
: /* Table for translating characters while decoding. */);
11199 Vstandard_translation_table_for_decode
= Qnil
;
11201 DEFVAR_LISP ("standard-translation-table-for-encode",
11202 Vstandard_translation_table_for_encode
,
11203 doc
: /* Table for translating characters while encoding. */);
11204 Vstandard_translation_table_for_encode
= Qnil
;
11206 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table
,
11207 doc
: /* Alist of charsets vs revision numbers.
11208 While encoding, if a charset (car part of an element) is found,
11209 designate it with the escape sequence identifying revision (cdr part
11210 of the element). */);
11211 Vcharset_revision_table
= Qnil
;
11213 DEFVAR_LISP ("default-process-coding-system",
11214 Vdefault_process_coding_system
,
11215 doc
: /* Cons of coding systems used for process I/O by default.
11216 The car part is used for decoding a process output,
11217 the cdr part is used for encoding a text to be sent to a process. */);
11218 Vdefault_process_coding_system
= Qnil
;
11220 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table
,
11222 Table of extra Latin codes in the range 128..159 (inclusive).
11223 This is a vector of length 256.
11224 If Nth element is non-nil, the existence of code N in a file
11225 \(or output of subprocess) doesn't prevent it from being detected as
11226 a coding system of ISO 2022 variant which has a flag
11227 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11228 or reading output of a subprocess.
11229 Only 128th through 159th elements have a meaning. */);
11230 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
11232 DEFVAR_LISP ("select-safe-coding-system-function",
11233 Vselect_safe_coding_system_function
,
11235 Function to call to select safe coding system for encoding a text.
11237 If set, this function is called to force a user to select a proper
11238 coding system which can encode the text in the case that a default
11239 coding system used in each operation can't encode the text. The
11240 function should take care that the buffer is not modified while
11241 the coding system is being selected.
11243 The default value is `select-safe-coding-system' (which see). */);
11244 Vselect_safe_coding_system_function
= Qnil
;
11246 DEFVAR_BOOL ("coding-system-require-warning",
11247 coding_system_require_warning
,
11248 doc
: /* Internal use only.
11249 If non-nil, on writing a file, `select-safe-coding-system-function' is
11250 called even if `coding-system-for-write' is non-nil. The command
11251 `universal-coding-system-argument' binds this variable to t temporarily. */);
11252 coding_system_require_warning
= 0;
11255 DEFVAR_BOOL ("inhibit-iso-escape-detection",
11256 inhibit_iso_escape_detection
,
11258 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11260 When Emacs reads text, it tries to detect how the text is encoded.
11261 This code detection is sensitive to escape sequences. If Emacs sees
11262 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11263 of the ISO2022 encodings, and decodes text by the corresponding coding
11264 system (e.g. `iso-2022-7bit').
11266 However, there may be a case that you want to read escape sequences in
11267 a file as is. In such a case, you can set this variable to non-nil.
11268 Then the code detection will ignore any escape sequences, and no text is
11269 detected as encoded in some ISO-2022 encoding. The result is that all
11270 escape sequences become visible in a buffer.
11272 The default value is nil, and it is strongly recommended not to change
11273 it. That is because many Emacs Lisp source files that contain
11274 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11275 in Emacs's distribution, and they won't be decoded correctly on
11276 reading if you suppress escape sequence detection.
11278 The other way to read escape sequences in a file without decoding is
11279 to explicitly specify some coding system that doesn't use ISO-2022
11280 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument]. */);
11281 inhibit_iso_escape_detection
= 0;
11283 DEFVAR_BOOL ("inhibit-null-byte-detection",
11284 inhibit_null_byte_detection
,
11285 doc
: /* If non-nil, Emacs ignores null bytes on code detection.
11286 By default, Emacs treats it as binary data, and does not attempt to
11287 decode it. The effect is as if you specified `no-conversion' for
11290 Set this to non-nil when a regular text happens to include null bytes.
11291 Examples are Index nodes of Info files and null-byte delimited output
11292 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
11293 decode text as usual. */);
11294 inhibit_null_byte_detection
= 0;
11296 DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization
,
11297 doc
: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11298 Internal use only. Remove after the experimental optimizer becomes stable. */);
11299 disable_ascii_optimization
= 0;
11301 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input
,
11302 doc
: /* Char table for translating self-inserting characters.
11303 This is applied to the result of input methods, not their input.
11304 See also `keyboard-translate-table'.
11306 Use of this variable for character code unification was rendered
11307 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11308 internal character representation. */);
11309 Vtranslation_table_for_input
= Qnil
;
11312 Lisp_Object args
[coding_arg_undecided_max
];
11313 Lisp_Object plist
[16];
11316 for (i
= 0; i
< coding_arg_undecided_max
; i
++)
11319 plist
[0] = intern_c_string (":name");
11320 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
11321 plist
[2] = intern_c_string (":mnemonic");
11322 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
11323 plist
[4] = intern_c_string (":coding-type");
11324 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
11325 plist
[6] = intern_c_string (":ascii-compatible-p");
11326 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
11327 plist
[8] = intern_c_string (":default-char");
11328 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
11329 plist
[10] = intern_c_string (":for-unibyte");
11330 plist
[11] = args
[coding_arg_for_unibyte
] = Qt
;
11331 plist
[12] = intern_c_string (":docstring");
11332 plist
[13] = build_pure_c_string ("Do no conversion.\n\
11334 When you visit a file with this coding, the file is read into a\n\
11335 unibyte buffer as is, thus each byte of a file is treated as a\n\
11337 plist
[14] = intern_c_string (":eol-type");
11338 plist
[15] = args
[coding_arg_eol_type
] = Qunix
;
11339 args
[coding_arg_plist
] = Flist (16, plist
);
11340 Fdefine_coding_system_internal (coding_arg_max
, args
);
11342 plist
[1] = args
[coding_arg_name
] = Qundecided
;
11343 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('-');
11344 plist
[5] = args
[coding_arg_coding_type
] = Qundecided
;
11345 /* This is already set.
11346 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11347 plist
[8] = intern_c_string (":charset-list");
11348 plist
[9] = args
[coding_arg_charset_list
] = Fcons (Qascii
, Qnil
);
11349 plist
[11] = args
[coding_arg_for_unibyte
] = Qnil
;
11350 plist
[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11351 plist
[15] = args
[coding_arg_eol_type
] = Qnil
;
11352 args
[coding_arg_plist
] = Flist (16, plist
);
11353 args
[coding_arg_undecided_inhibit_null_byte_detection
] = make_number (0);
11354 args
[coding_arg_undecided_inhibit_iso_escape_detection
] = make_number (0);
11355 Fdefine_coding_system_internal (coding_arg_undecided_max
, args
);
11358 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
11363 for (i
= 0; i
< coding_category_max
; i
++)
11364 Fset (AREF (Vcoding_category_table
, i
), Qno_conversion
);
11366 #if defined (DOS_NT)
11367 system_eol_type
= Qdos
;
11369 system_eol_type
= Qunix
;
11371 staticpro (&system_eol_type
);
11375 emacs_strerror (int error_number
)
11379 synchronize_system_messages_locale ();
11380 str
= strerror (error_number
);
11382 if (! NILP (Vlocale_coding_system
))
11384 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
11385 Vlocale_coding_system
,
11387 str
= SSDATA (dec
);