(calendar-location-name, calendar-latitude)
[emacs.git] / src / coding.c
blob901f81a9247713c123133882f6e746098c846990
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006, 2007, 2008 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5 2005, 2006, 2007, 2008
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8 Copyright (C) 2003
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
12 This file is part of GNU Emacs.
14 GNU Emacs is free software; you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation; either version 3, or (at your option)
17 any later version.
19 GNU Emacs is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with GNU Emacs; see the file COPYING. If not, write to
26 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
27 Boston, MA 02110-1301, USA. */
29 /*** TABLE OF CONTENTS ***
31 0. General comments
32 1. Preamble
33 2. Emacs' internal format (emacs-utf-8) handlers
34 3. UTF-8 handlers
35 4. UTF-16 handlers
36 5. Charset-base coding systems handlers
37 6. emacs-mule (old Emacs' internal format) handlers
38 7. ISO2022 handlers
39 8. Shift-JIS and BIG5 handlers
40 9. CCL handlers
41 10. C library functions
42 11. Emacs Lisp library functions
43 12. Postamble
47 /*** 0. General comments ***
50 CODING SYSTEM
52 A coding system is an object for an encoding mechanism that contains
53 information about how to convert byte sequences to character
54 sequences and vice versa. When we say "decode", it means converting
55 a byte sequence of a specific coding system into a character
56 sequence that is represented by Emacs' internal coding system
57 `emacs-utf-8', and when we say "encode", it means converting a
58 character sequence of emacs-utf-8 to a byte sequence of a specific
59 coding system.
61 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
62 C level, a coding system is represented by a vector of attributes
63 stored in the hash table Vcharset_hash_table. The conversion from
64 coding system symbol to attributes vector is done by looking up
65 Vcharset_hash_table by the symbol.
67 Coding systems are classified into the following types depending on
68 the encoding mechanism. Here's a brief description of the types.
70 o UTF-8
72 o UTF-16
74 o Charset-base coding system
76 A coding system defined by one or more (coded) character sets.
77 Decoding and encoding are done by a code converter defined for each
78 character set.
80 o Old Emacs internal format (emacs-mule)
82 The coding system adopted by old versions of Emacs (20 and 21).
84 o ISO2022-base coding system
86 The most famous coding system for multiple character sets. X's
87 Compound Text, various EUCs (Extended Unix Code), and coding systems
88 used in the Internet communication such as ISO-2022-JP are all
89 variants of ISO2022.
91 o SJIS (or Shift-JIS or MS-Kanji-Code)
93 A coding system to encode character sets: ASCII, JISX0201, and
94 JISX0208. Widely used for PC's in Japan. Details are described in
95 section 8.
97 o BIG5
99 A coding system to encode character sets: ASCII and Big5. Widely
100 used for Chinese (mainly in Taiwan and Hong Kong). Details are
101 described in section 8. In this file, when we write "big5" (all
102 lowercase), we mean the coding system, and when we write "Big5"
103 (capitalized), we mean the character set.
105 o CCL
107 If a user wants to decode/encode text encoded in a coding system
108 not listed above, he can supply a decoder and an encoder for it in
109 CCL (Code Conversion Language) programs. Emacs executes the CCL
110 program while decoding/encoding.
112 o Raw-text
114 A coding system for text containing raw eight-bit data. Emacs
115 treats each byte of source text as a character (except for
116 end-of-line conversion).
118 o No-conversion
120 Like raw text, but don't do end-of-line conversion.
123 END-OF-LINE FORMAT
125 How text end-of-line is encoded depends on operating system. For
126 instance, Unix's format is just one byte of LF (line-feed) code,
127 whereas DOS's format is two-byte sequence of `carriage-return' and
128 `line-feed' codes. MacOS's format is usually one byte of
129 `carriage-return'.
131 Since text character encoding and end-of-line encoding are
132 independent, any coding system described above can take any format
133 of end-of-line (except for no-conversion).
135 STRUCT CODING_SYSTEM
137 Before using a coding system for code conversion (i.e. decoding and
138 encoding), we setup a structure of type `struct coding_system'.
139 This structure keeps various information about a specific code
140 conversion (e.g. the location of source and destination data).
144 /* COMMON MACROS */
147 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
149 These functions check if a byte sequence specified as a source in
150 CODING conforms to the format of XXX, and update the members of
151 DETECT_INFO.
153 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
155 Below is the template of these functions. */
157 #if 0
158 static int
159 detect_coding_XXX (coding, detect_info)
160 struct coding_system *coding;
161 struct coding_detection_info *detect_info;
163 const unsigned char *src = coding->source;
164 const unsigned char *src_end = coding->source + coding->src_bytes;
165 int multibytep = coding->src_multibyte;
166 int consumed_chars = 0;
167 int found = 0;
168 ...;
170 while (1)
172 /* Get one byte from the source. If the souce is exausted, jump
173 to no_more_source:. */
174 ONE_MORE_BYTE (c);
176 if (! __C_conforms_to_XXX___ (c))
177 break;
178 if (! __C_strongly_suggests_XXX__ (c))
179 found = CATEGORY_MASK_XXX;
181 /* The byte sequence is invalid for XXX. */
182 detect_info->rejected |= CATEGORY_MASK_XXX;
183 return 0;
185 no_more_source:
186 /* The source exausted successfully. */
187 detect_info->found |= found;
188 return 1;
190 #endif
192 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
194 These functions decode a byte sequence specified as a source by
195 CODING. The resulting multibyte text goes to a place pointed to by
196 CODING->charbuf, the length of which should not exceed
197 CODING->charbuf_size;
199 These functions set the information of original and decoded texts in
200 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
201 They also set CODING->result to one of CODING_RESULT_XXX indicating
202 how the decoding is finished.
204 Below is the template of these functions. */
206 #if 0
207 static void
208 decode_coding_XXXX (coding)
209 struct coding_system *coding;
211 const unsigned char *src = coding->source + coding->consumed;
212 const unsigned char *src_end = coding->source + coding->src_bytes;
213 /* SRC_BASE remembers the start position in source in each loop.
214 The loop will be exited when there's not enough source code, or
215 when there's no room in CHARBUF for a decoded character. */
216 const unsigned char *src_base;
217 /* A buffer to produce decoded characters. */
218 int *charbuf = coding->charbuf + coding->charbuf_used;
219 int *charbuf_end = coding->charbuf + coding->charbuf_size;
220 int multibytep = coding->src_multibyte;
222 while (1)
224 src_base = src;
225 if (charbuf < charbuf_end)
226 /* No more room to produce a decoded character. */
227 break;
228 ONE_MORE_BYTE (c);
229 /* Decode it. */
232 no_more_source:
233 if (src_base < src_end
234 && coding->mode & CODING_MODE_LAST_BLOCK)
235 /* If the source ends by partial bytes to construct a character,
236 treat them as eight-bit raw data. */
237 while (src_base < src_end && charbuf < charbuf_end)
238 *charbuf++ = *src_base++;
239 /* Remember how many bytes and characters we consumed. If the
240 source is multibyte, the bytes and chars are not identical. */
241 coding->consumed = coding->consumed_char = src_base - coding->source;
242 /* Remember how many characters we produced. */
243 coding->charbuf_used = charbuf - coding->charbuf;
245 #endif
247 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
249 These functions encode SRC_BYTES length text at SOURCE of Emacs'
250 internal multibyte format by CODING. The resulting byte sequence
251 goes to a place pointed to by DESTINATION, the length of which
252 should not exceed DST_BYTES.
254 These functions set the information of original and encoded texts in
255 the members produced, produced_char, consumed, and consumed_char of
256 the structure *CODING. They also set the member result to one of
257 CODING_RESULT_XXX indicating how the encoding finished.
259 DST_BYTES zero means that source area and destination area are
260 overlapped, which means that we can produce a encoded text until it
261 reaches at the head of not-yet-encoded source text.
263 Below is a template of these functions. */
264 #if 0
265 static void
266 encode_coding_XXX (coding)
267 struct coding_system *coding;
269 int multibytep = coding->dst_multibyte;
270 int *charbuf = coding->charbuf;
271 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
272 unsigned char *dst = coding->destination + coding->produced;
273 unsigned char *dst_end = coding->destination + coding->dst_bytes;
274 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
275 int produced_chars = 0;
277 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
279 int c = *charbuf;
280 /* Encode C into DST, and increment DST. */
282 label_no_more_destination:
283 /* How many chars and bytes we produced. */
284 coding->produced_char += produced_chars;
285 coding->produced = dst - coding->destination;
287 #endif
290 /*** 1. Preamble ***/
292 #include <config.h>
293 #include <stdio.h>
295 #include "lisp.h"
296 #include "buffer.h"
297 #include "character.h"
298 #include "charset.h"
299 #include "ccl.h"
300 #include "composite.h"
301 #include "coding.h"
302 #include "window.h"
303 #include "frame.h"
304 #include "termhooks.h"
306 Lisp_Object Vcoding_system_hash_table;
308 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
309 Lisp_Object Qunix, Qdos;
310 extern Lisp_Object Qmac; /* frame.c */
311 Lisp_Object Qbuffer_file_coding_system;
312 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
313 Lisp_Object Qdefault_char;
314 Lisp_Object Qno_conversion, Qundecided;
315 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
316 Lisp_Object Qbig, Qlittle;
317 Lisp_Object Qcoding_system_history;
318 Lisp_Object Qvalid_codes;
319 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
320 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
321 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
322 Lisp_Object QCascii_compatible_p;
324 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
325 Lisp_Object Qcall_process, Qcall_process_region;
326 Lisp_Object Qstart_process, Qopen_network_stream;
327 Lisp_Object Qtarget_idx;
329 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
330 Lisp_Object Qinterrupted, Qinsufficient_memory;
332 extern Lisp_Object Qcompletion_ignore_case;
334 /* If a symbol has this property, evaluate the value to define the
335 symbol as a coding system. */
336 static Lisp_Object Qcoding_system_define_form;
338 int coding_system_require_warning;
340 Lisp_Object Vselect_safe_coding_system_function;
342 /* Mnemonic string for each format of end-of-line. */
343 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
344 /* Mnemonic string to indicate format of end-of-line is not yet
345 decided. */
346 Lisp_Object eol_mnemonic_undecided;
348 /* Format of end-of-line decided by system. This is Qunix on
349 Unix and Mac, Qdos on DOS/Windows.
350 This has an effect only for external encoding (i.e. for output to
351 file and process), not for in-buffer or Lisp string encoding. */
352 static Lisp_Object system_eol_type;
354 #ifdef emacs
356 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
358 Lisp_Object Qcoding_system_p, Qcoding_system_error;
360 /* Coding system emacs-mule and raw-text are for converting only
361 end-of-line format. */
362 Lisp_Object Qemacs_mule, Qraw_text;
363 Lisp_Object Qutf_8_emacs;
365 /* Coding-systems are handed between Emacs Lisp programs and C internal
366 routines by the following three variables. */
367 /* Coding-system for reading files and receiving data from process. */
368 Lisp_Object Vcoding_system_for_read;
369 /* Coding-system for writing files and sending data to process. */
370 Lisp_Object Vcoding_system_for_write;
371 /* Coding-system actually used in the latest I/O. */
372 Lisp_Object Vlast_coding_system_used;
373 /* Set to non-nil when an error is detected while code conversion. */
374 Lisp_Object Vlast_code_conversion_error;
375 /* A vector of length 256 which contains information about special
376 Latin codes (especially for dealing with Microsoft codes). */
377 Lisp_Object Vlatin_extra_code_table;
379 /* Flag to inhibit code conversion of end-of-line format. */
380 int inhibit_eol_conversion;
382 /* Flag to inhibit ISO2022 escape sequence detection. */
383 int inhibit_iso_escape_detection;
385 /* Flag to make buffer-file-coding-system inherit from process-coding. */
386 int inherit_process_coding_system;
388 /* Coding system to be used to encode text for terminal display when
389 terminal coding system is nil. */
390 struct coding_system safe_terminal_coding;
392 Lisp_Object Vfile_coding_system_alist;
393 Lisp_Object Vprocess_coding_system_alist;
394 Lisp_Object Vnetwork_coding_system_alist;
396 Lisp_Object Vlocale_coding_system;
398 #endif /* emacs */
400 /* Flag to tell if we look up translation table on character code
401 conversion. */
402 Lisp_Object Venable_character_translation;
403 /* Standard translation table to look up on decoding (reading). */
404 Lisp_Object Vstandard_translation_table_for_decode;
405 /* Standard translation table to look up on encoding (writing). */
406 Lisp_Object Vstandard_translation_table_for_encode;
408 Lisp_Object Qtranslation_table;
409 Lisp_Object Qtranslation_table_id;
410 Lisp_Object Qtranslation_table_for_decode;
411 Lisp_Object Qtranslation_table_for_encode;
413 /* Alist of charsets vs revision number. */
414 static Lisp_Object Vcharset_revision_table;
416 /* Default coding systems used for process I/O. */
417 Lisp_Object Vdefault_process_coding_system;
419 /* Char table for translating Quail and self-inserting input. */
420 Lisp_Object Vtranslation_table_for_input;
422 /* Two special coding systems. */
423 Lisp_Object Vsjis_coding_system;
424 Lisp_Object Vbig5_coding_system;
426 /* ISO2022 section */
428 #define CODING_ISO_INITIAL(coding, reg) \
429 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
430 coding_attr_iso_initial), \
431 reg)))
434 #define CODING_ISO_REQUEST(coding, charset_id) \
435 ((charset_id <= (coding)->max_charset_id \
436 ? (coding)->safe_charsets[charset_id] \
437 : -1))
440 #define CODING_ISO_FLAGS(coding) \
441 ((coding)->spec.iso_2022.flags)
442 #define CODING_ISO_DESIGNATION(coding, reg) \
443 ((coding)->spec.iso_2022.current_designation[reg])
444 #define CODING_ISO_INVOCATION(coding, plane) \
445 ((coding)->spec.iso_2022.current_invocation[plane])
446 #define CODING_ISO_SINGLE_SHIFTING(coding) \
447 ((coding)->spec.iso_2022.single_shifting)
448 #define CODING_ISO_BOL(coding) \
449 ((coding)->spec.iso_2022.bol)
450 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
451 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
453 /* Control characters of ISO2022. */
454 /* code */ /* function */
455 #define ISO_CODE_LF 0x0A /* line-feed */
456 #define ISO_CODE_CR 0x0D /* carriage-return */
457 #define ISO_CODE_SO 0x0E /* shift-out */
458 #define ISO_CODE_SI 0x0F /* shift-in */
459 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
460 #define ISO_CODE_ESC 0x1B /* escape */
461 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
462 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
463 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
465 /* All code (1-byte) of ISO2022 is classified into one of the
466 followings. */
467 enum iso_code_class_type
469 ISO_control_0, /* Control codes in the range
470 0x00..0x1F and 0x7F, except for the
471 following 5 codes. */
472 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
473 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
474 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
475 ISO_escape, /* ISO_CODE_SO (0x1B) */
476 ISO_control_1, /* Control codes in the range
477 0x80..0x9F, except for the
478 following 3 codes. */
479 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
480 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
481 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
482 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
483 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
484 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
485 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
488 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
489 `iso-flags' attribute of an iso2022 coding system. */
491 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
492 instead of the correct short-form sequence (e.g. ESC $ A). */
493 #define CODING_ISO_FLAG_LONG_FORM 0x0001
495 /* If set, reset graphic planes and registers at end-of-line to the
496 initial state. */
497 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
499 /* If set, reset graphic planes and registers before any control
500 characters to the initial state. */
501 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
503 /* If set, encode by 7-bit environment. */
504 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
506 /* If set, use locking-shift function. */
507 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
509 /* If set, use single-shift function. Overwrite
510 CODING_ISO_FLAG_LOCKING_SHIFT. */
511 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
513 /* If set, use designation escape sequence. */
514 #define CODING_ISO_FLAG_DESIGNATION 0x0040
516 /* If set, produce revision number sequence. */
517 #define CODING_ISO_FLAG_REVISION 0x0080
519 /* If set, produce ISO6429's direction specifying sequence. */
520 #define CODING_ISO_FLAG_DIRECTION 0x0100
522 /* If set, assume designation states are reset at beginning of line on
523 output. */
524 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
526 /* If set, designation sequence should be placed at beginning of line
527 on output. */
528 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
530 /* If set, do not encode unsafe charactes on output. */
531 #define CODING_ISO_FLAG_SAFE 0x0800
533 /* If set, extra latin codes (128..159) are accepted as a valid code
534 on input. */
535 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
537 #define CODING_ISO_FLAG_COMPOSITION 0x2000
539 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
541 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
543 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
545 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
547 /* A character to be produced on output if encoding of the original
548 character is prohibited by CODING_ISO_FLAG_SAFE. */
549 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
552 /* UTF-16 section */
553 #define CODING_UTF_16_BOM(coding) \
554 ((coding)->spec.utf_16.bom)
556 #define CODING_UTF_16_ENDIAN(coding) \
557 ((coding)->spec.utf_16.endian)
559 #define CODING_UTF_16_SURROGATE(coding) \
560 ((coding)->spec.utf_16.surrogate)
563 /* CCL section */
564 #define CODING_CCL_DECODER(coding) \
565 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
566 #define CODING_CCL_ENCODER(coding) \
567 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
568 #define CODING_CCL_VALIDS(coding) \
569 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
571 /* Index for each coding category in `coding_categories' */
573 enum coding_category
575 coding_category_iso_7,
576 coding_category_iso_7_tight,
577 coding_category_iso_8_1,
578 coding_category_iso_8_2,
579 coding_category_iso_7_else,
580 coding_category_iso_8_else,
581 coding_category_utf_8,
582 coding_category_utf_16_auto,
583 coding_category_utf_16_be,
584 coding_category_utf_16_le,
585 coding_category_utf_16_be_nosig,
586 coding_category_utf_16_le_nosig,
587 coding_category_charset,
588 coding_category_sjis,
589 coding_category_big5,
590 coding_category_ccl,
591 coding_category_emacs_mule,
592 /* All above are targets of code detection. */
593 coding_category_raw_text,
594 coding_category_undecided,
595 coding_category_max
598 /* Definitions of flag bits used in detect_coding_XXXX. */
599 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
600 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
601 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
602 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
603 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
604 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
605 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
606 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
607 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
608 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
609 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
610 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
611 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
612 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
613 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
614 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
615 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
616 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
618 /* This value is returned if detect_coding_mask () find nothing other
619 than ASCII characters. */
620 #define CATEGORY_MASK_ANY \
621 (CATEGORY_MASK_ISO_7 \
622 | CATEGORY_MASK_ISO_7_TIGHT \
623 | CATEGORY_MASK_ISO_8_1 \
624 | CATEGORY_MASK_ISO_8_2 \
625 | CATEGORY_MASK_ISO_7_ELSE \
626 | CATEGORY_MASK_ISO_8_ELSE \
627 | CATEGORY_MASK_UTF_8 \
628 | CATEGORY_MASK_UTF_16_BE \
629 | CATEGORY_MASK_UTF_16_LE \
630 | CATEGORY_MASK_UTF_16_BE_NOSIG \
631 | CATEGORY_MASK_UTF_16_LE_NOSIG \
632 | CATEGORY_MASK_CHARSET \
633 | CATEGORY_MASK_SJIS \
634 | CATEGORY_MASK_BIG5 \
635 | CATEGORY_MASK_CCL \
636 | CATEGORY_MASK_EMACS_MULE)
639 #define CATEGORY_MASK_ISO_7BIT \
640 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
642 #define CATEGORY_MASK_ISO_8BIT \
643 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
645 #define CATEGORY_MASK_ISO_ELSE \
646 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
648 #define CATEGORY_MASK_ISO_ESCAPE \
649 (CATEGORY_MASK_ISO_7 \
650 | CATEGORY_MASK_ISO_7_TIGHT \
651 | CATEGORY_MASK_ISO_7_ELSE \
652 | CATEGORY_MASK_ISO_8_ELSE)
654 #define CATEGORY_MASK_ISO \
655 ( CATEGORY_MASK_ISO_7BIT \
656 | CATEGORY_MASK_ISO_8BIT \
657 | CATEGORY_MASK_ISO_ELSE)
659 #define CATEGORY_MASK_UTF_16 \
660 (CATEGORY_MASK_UTF_16_BE \
661 | CATEGORY_MASK_UTF_16_LE \
662 | CATEGORY_MASK_UTF_16_BE_NOSIG \
663 | CATEGORY_MASK_UTF_16_LE_NOSIG)
666 /* List of symbols `coding-category-xxx' ordered by priority. This
667 variable is exposed to Emacs Lisp. */
668 static Lisp_Object Vcoding_category_list;
670 /* Table of coding categories (Lisp symbols). This variable is for
671 internal use oly. */
672 static Lisp_Object Vcoding_category_table;
674 /* Table of coding-categories ordered by priority. */
675 static enum coding_category coding_priorities[coding_category_max];
677 /* Nth element is a coding context for the coding system bound to the
678 Nth coding category. */
679 static struct coding_system coding_categories[coding_category_max];
681 /*** Commonly used macros and functions ***/
683 #ifndef min
684 #define min(a, b) ((a) < (b) ? (a) : (b))
685 #endif
686 #ifndef max
687 #define max(a, b) ((a) > (b) ? (a) : (b))
688 #endif
690 #define CODING_GET_INFO(coding, attrs, charset_list) \
691 do { \
692 (attrs) = CODING_ID_ATTRS ((coding)->id); \
693 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
694 } while (0)
697 /* Safely get one byte from the source text pointed by SRC which ends
698 at SRC_END, and set C to that byte. If there are not enough bytes
699 in the source, it jumps to `no_more_source'. If multibytep is
700 nonzero, and a multibyte character is found at SRC, set C to the
701 negative value of the character code. The caller should declare
702 and set these variables appropriately in advance:
703 src, src_end, multibytep */
705 #define ONE_MORE_BYTE(c) \
706 do { \
707 if (src == src_end) \
709 if (src_base < src) \
710 record_conversion_result \
711 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
712 goto no_more_source; \
714 c = *src++; \
715 if (multibytep && (c & 0x80)) \
717 if ((c & 0xFE) == 0xC0) \
718 c = ((c & 1) << 6) | *src++; \
719 else \
721 src--; \
722 c = - string_char (src, &src, NULL); \
723 record_conversion_result \
724 (coding, CODING_RESULT_INVALID_SRC); \
727 consumed_chars++; \
728 } while (0)
731 #define ONE_MORE_BYTE_NO_CHECK(c) \
732 do { \
733 c = *src++; \
734 if (multibytep && (c & 0x80)) \
736 if ((c & 0xFE) == 0xC0) \
737 c = ((c & 1) << 6) | *src++; \
738 else \
740 src--; \
741 c = - string_char (src, &src, NULL); \
742 record_conversion_result \
743 (coding, CODING_RESULT_INVALID_SRC); \
746 consumed_chars++; \
747 } while (0)
750 /* Store a byte C in the place pointed by DST and increment DST to the
751 next free point, and increment PRODUCED_CHARS. The caller should
752 assure that C is 0..127, and declare and set the variable `dst'
753 appropriately in advance.
757 #define EMIT_ONE_ASCII_BYTE(c) \
758 do { \
759 produced_chars++; \
760 *dst++ = (c); \
761 } while (0)
764 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
766 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
767 do { \
768 produced_chars += 2; \
769 *dst++ = (c1), *dst++ = (c2); \
770 } while (0)
773 /* Store a byte C in the place pointed by DST and increment DST to the
774 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
775 nonzero, store in an appropriate multibyte from. The caller should
776 declare and set the variables `dst' and `multibytep' appropriately
777 in advance. */
779 #define EMIT_ONE_BYTE(c) \
780 do { \
781 produced_chars++; \
782 if (multibytep) \
784 int ch = (c); \
785 if (ch >= 0x80) \
786 ch = BYTE8_TO_CHAR (ch); \
787 CHAR_STRING_ADVANCE (ch, dst); \
789 else \
790 *dst++ = (c); \
791 } while (0)
794 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
796 #define EMIT_TWO_BYTES(c1, c2) \
797 do { \
798 produced_chars += 2; \
799 if (multibytep) \
801 int ch; \
803 ch = (c1); \
804 if (ch >= 0x80) \
805 ch = BYTE8_TO_CHAR (ch); \
806 CHAR_STRING_ADVANCE (ch, dst); \
807 ch = (c2); \
808 if (ch >= 0x80) \
809 ch = BYTE8_TO_CHAR (ch); \
810 CHAR_STRING_ADVANCE (ch, dst); \
812 else \
814 *dst++ = (c1); \
815 *dst++ = (c2); \
817 } while (0)
820 #define EMIT_THREE_BYTES(c1, c2, c3) \
821 do { \
822 EMIT_ONE_BYTE (c1); \
823 EMIT_TWO_BYTES (c2, c3); \
824 } while (0)
827 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
828 do { \
829 EMIT_TWO_BYTES (c1, c2); \
830 EMIT_TWO_BYTES (c3, c4); \
831 } while (0)
834 /* Prototypes for static functions. */
835 static void record_conversion_result P_ ((struct coding_system *coding,
836 enum coding_result_code result));
837 static int detect_coding_utf_8 P_ ((struct coding_system *,
838 struct coding_detection_info *info));
839 static void decode_coding_utf_8 P_ ((struct coding_system *));
840 static int encode_coding_utf_8 P_ ((struct coding_system *));
842 static int detect_coding_utf_16 P_ ((struct coding_system *,
843 struct coding_detection_info *info));
844 static void decode_coding_utf_16 P_ ((struct coding_system *));
845 static int encode_coding_utf_16 P_ ((struct coding_system *));
847 static int detect_coding_iso_2022 P_ ((struct coding_system *,
848 struct coding_detection_info *info));
849 static void decode_coding_iso_2022 P_ ((struct coding_system *));
850 static int encode_coding_iso_2022 P_ ((struct coding_system *));
852 static int detect_coding_emacs_mule P_ ((struct coding_system *,
853 struct coding_detection_info *info));
854 static void decode_coding_emacs_mule P_ ((struct coding_system *));
855 static int encode_coding_emacs_mule P_ ((struct coding_system *));
857 static int detect_coding_sjis P_ ((struct coding_system *,
858 struct coding_detection_info *info));
859 static void decode_coding_sjis P_ ((struct coding_system *));
860 static int encode_coding_sjis P_ ((struct coding_system *));
862 static int detect_coding_big5 P_ ((struct coding_system *,
863 struct coding_detection_info *info));
864 static void decode_coding_big5 P_ ((struct coding_system *));
865 static int encode_coding_big5 P_ ((struct coding_system *));
867 static int detect_coding_ccl P_ ((struct coding_system *,
868 struct coding_detection_info *info));
869 static void decode_coding_ccl P_ ((struct coding_system *));
870 static int encode_coding_ccl P_ ((struct coding_system *));
872 static void decode_coding_raw_text P_ ((struct coding_system *));
873 static int encode_coding_raw_text P_ ((struct coding_system *));
875 static void coding_set_source P_ ((struct coding_system *));
876 static void coding_set_destination P_ ((struct coding_system *));
877 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
878 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
879 EMACS_INT, EMACS_INT));
880 static unsigned char *alloc_destination P_ ((struct coding_system *,
881 EMACS_INT, unsigned char *));
882 static void setup_iso_safe_charsets P_ ((Lisp_Object));
883 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
884 int *, int *,
885 unsigned char *));
886 static int detect_eol P_ ((const unsigned char *,
887 EMACS_INT, enum coding_category));
888 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
889 static void decode_eol P_ ((struct coding_system *));
890 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
891 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
892 int, int *, int *));
893 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
894 static INLINE void produce_composition P_ ((struct coding_system *, int *,
895 EMACS_INT));
896 static INLINE void produce_charset P_ ((struct coding_system *, int *,
897 EMACS_INT));
898 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
899 static int decode_coding P_ ((struct coding_system *));
900 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
901 struct coding_system *,
902 int *, EMACS_INT *));
903 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
904 struct coding_system *,
905 int *, EMACS_INT *));
906 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
907 static int encode_coding P_ ((struct coding_system *));
908 static Lisp_Object make_conversion_work_buffer P_ ((int));
909 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
910 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
911 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
913 static void
914 record_conversion_result (struct coding_system *coding,
915 enum coding_result_code result)
917 coding->result = result;
918 switch (result)
920 case CODING_RESULT_INSUFFICIENT_SRC:
921 Vlast_code_conversion_error = Qinsufficient_source;
922 break;
923 case CODING_RESULT_INCONSISTENT_EOL:
924 Vlast_code_conversion_error = Qinconsistent_eol;
925 break;
926 case CODING_RESULT_INVALID_SRC:
927 Vlast_code_conversion_error = Qinvalid_source;
928 break;
929 case CODING_RESULT_INTERRUPT:
930 Vlast_code_conversion_error = Qinterrupted;
931 break;
932 case CODING_RESULT_INSUFFICIENT_MEM:
933 Vlast_code_conversion_error = Qinsufficient_memory;
934 break;
935 default:
936 Vlast_code_conversion_error = intern ("Unknown error");
940 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
941 do { \
942 charset_map_loaded = 0; \
943 c = DECODE_CHAR (charset, code); \
944 if (charset_map_loaded) \
946 const unsigned char *orig = coding->source; \
947 EMACS_INT offset; \
949 coding_set_source (coding); \
950 offset = coding->source - orig; \
951 src += offset; \
952 src_base += offset; \
953 src_end += offset; \
955 } while (0)
958 /* If there are at least BYTES length of room at dst, allocate memory
959 for coding->destination and update dst and dst_end. We don't have
960 to take care of coding->source which will be relocated. It is
961 handled by calling coding_set_source in encode_coding. */
963 #define ASSURE_DESTINATION(bytes) \
964 do { \
965 if (dst + (bytes) >= dst_end) \
967 int more_bytes = charbuf_end - charbuf + (bytes); \
969 dst = alloc_destination (coding, more_bytes, dst); \
970 dst_end = coding->destination + coding->dst_bytes; \
972 } while (0)
975 /* Store multibyte form of the character C in P, and advance P to the
976 end of the multibyte form. This is like CHAR_STRING_ADVANCE but it
977 never calls MAYBE_UNIFY_CHAR. */
979 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) \
980 do { \
981 if ((c) <= MAX_1_BYTE_CHAR) \
982 *(p)++ = (c); \
983 else if ((c) <= MAX_2_BYTE_CHAR) \
984 *(p)++ = (0xC0 | ((c) >> 6)), \
985 *(p)++ = (0x80 | ((c) & 0x3F)); \
986 else if ((c) <= MAX_3_BYTE_CHAR) \
987 *(p)++ = (0xE0 | ((c) >> 12)), \
988 *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
989 *(p)++ = (0x80 | ((c) & 0x3F)); \
990 else if ((c) <= MAX_4_BYTE_CHAR) \
991 *(p)++ = (0xF0 | (c >> 18)), \
992 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
993 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
994 *(p)++ = (0x80 | (c & 0x3F)); \
995 else if ((c) <= MAX_5_BYTE_CHAR) \
996 *(p)++ = 0xF8, \
997 *(p)++ = (0x80 | ((c >> 18) & 0x0F)), \
998 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
999 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
1000 *(p)++ = (0x80 | (c & 0x3F)); \
1001 else \
1002 (p) += BYTE8_STRING ((c) - 0x3FFF80, p); \
1003 } while (0)
1006 /* Return the character code of character whose multibyte form is at
1007 P, and advance P to the end of the multibyte form. This is like
1008 STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR. */
1010 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) \
1011 (!((p)[0] & 0x80) \
1012 ? *(p)++ \
1013 : ! ((p)[0] & 0x20) \
1014 ? ((p) += 2, \
1015 ((((p)[-2] & 0x1F) << 6) \
1016 | ((p)[-1] & 0x3F) \
1017 | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \
1018 : ! ((p)[0] & 0x10) \
1019 ? ((p) += 3, \
1020 ((((p)[-3] & 0x0F) << 12) \
1021 | (((p)[-2] & 0x3F) << 6) \
1022 | ((p)[-1] & 0x3F))) \
1023 : ! ((p)[0] & 0x08) \
1024 ? ((p) += 4, \
1025 ((((p)[-4] & 0xF) << 18) \
1026 | (((p)[-3] & 0x3F) << 12) \
1027 | (((p)[-2] & 0x3F) << 6) \
1028 | ((p)[-1] & 0x3F))) \
1029 : ((p) += 5, \
1030 ((((p)[-4] & 0x3F) << 18) \
1031 | (((p)[-3] & 0x3F) << 12) \
1032 | (((p)[-2] & 0x3F) << 6) \
1033 | ((p)[-1] & 0x3F))))
1036 static void
1037 coding_set_source (coding)
1038 struct coding_system *coding;
1040 if (BUFFERP (coding->src_object))
1042 struct buffer *buf = XBUFFER (coding->src_object);
1044 if (coding->src_pos < 0)
1045 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1046 else
1047 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1049 else if (STRINGP (coding->src_object))
1051 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1053 else
1054 /* Otherwise, the source is C string and is never relocated
1055 automatically. Thus we don't have to update anything. */
1059 static void
1060 coding_set_destination (coding)
1061 struct coding_system *coding;
1063 if (BUFFERP (coding->dst_object))
1065 if (coding->src_pos < 0)
1067 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1068 coding->dst_bytes = (GAP_END_ADDR
1069 - (coding->src_bytes - coding->consumed)
1070 - coding->destination);
1072 else
1074 /* We are sure that coding->dst_pos_byte is before the gap
1075 of the buffer. */
1076 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1077 + coding->dst_pos_byte - BEG_BYTE);
1078 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1079 - coding->destination);
1082 else
1083 /* Otherwise, the destination is C string and is never relocated
1084 automatically. Thus we don't have to update anything. */
1089 static void
1090 coding_alloc_by_realloc (coding, bytes)
1091 struct coding_system *coding;
1092 EMACS_INT bytes;
1094 coding->destination = (unsigned char *) xrealloc (coding->destination,
1095 coding->dst_bytes + bytes);
1096 coding->dst_bytes += bytes;
1099 static void
1100 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1101 struct coding_system *coding;
1102 EMACS_INT gap_head_used, bytes;
1104 if (EQ (coding->src_object, coding->dst_object))
1106 /* The gap may contain the produced data at the head and not-yet
1107 consumed data at the tail. To preserve those data, we at
1108 first make the gap size to zero, then increase the gap
1109 size. */
1110 EMACS_INT add = GAP_SIZE;
1112 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1113 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1114 make_gap (bytes);
1115 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1116 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1118 else
1120 Lisp_Object this_buffer;
1122 this_buffer = Fcurrent_buffer ();
1123 set_buffer_internal (XBUFFER (coding->dst_object));
1124 make_gap (bytes);
1125 set_buffer_internal (XBUFFER (this_buffer));
1130 static unsigned char *
1131 alloc_destination (coding, nbytes, dst)
1132 struct coding_system *coding;
1133 EMACS_INT nbytes;
1134 unsigned char *dst;
1136 EMACS_INT offset = dst - coding->destination;
1138 if (BUFFERP (coding->dst_object))
1140 struct buffer *buf = XBUFFER (coding->dst_object);
1142 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1144 else
1145 coding_alloc_by_realloc (coding, nbytes);
1146 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1147 coding_set_destination (coding);
1148 dst = coding->destination + offset;
1149 return dst;
1152 /** Macros for annotations. */
1154 /* Maximum length of annotation data (sum of annotations for
1155 composition and charset). */
1156 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1158 /* An annotation data is stored in the array coding->charbuf in this
1159 format:
1160 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1161 LENGTH is the number of elements in the annotation.
1162 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1163 NCHARS is the number of characters in the text annotated.
1165 The format of the following elements depend on ANNOTATION_MASK.
1167 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1168 follows:
1169 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1170 METHOD is one of enum composition_method.
1171 Optionnal COMPOSITION-COMPONENTS are characters and composition
1172 rules.
1174 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1175 follows. */
1177 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1178 do { \
1179 *(buf)++ = -(len); \
1180 *(buf)++ = (mask); \
1181 *(buf)++ = (nchars); \
1182 coding->annotated = 1; \
1183 } while (0);
1185 #define ADD_COMPOSITION_DATA(buf, nchars, method) \
1186 do { \
1187 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1188 *buf++ = method; \
1189 } while (0)
1192 #define ADD_CHARSET_DATA(buf, nchars, id) \
1193 do { \
1194 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1195 *buf++ = id; \
1196 } while (0)
1199 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1204 /*** 3. UTF-8 ***/
1206 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1207 Check if a text is encoded in UTF-8. If it is, return 1, else
1208 return 0. */
1210 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1211 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1212 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1213 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1214 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1215 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1217 static int
1218 detect_coding_utf_8 (coding, detect_info)
1219 struct coding_system *coding;
1220 struct coding_detection_info *detect_info;
1222 const unsigned char *src = coding->source, *src_base;
1223 const unsigned char *src_end = coding->source + coding->src_bytes;
1224 int multibytep = coding->src_multibyte;
1225 int consumed_chars = 0;
1226 int found = 0;
1228 detect_info->checked |= CATEGORY_MASK_UTF_8;
1229 /* A coding system of this category is always ASCII compatible. */
1230 src += coding->head_ascii;
1232 while (1)
1234 int c, c1, c2, c3, c4;
1236 src_base = src;
1237 ONE_MORE_BYTE (c);
1238 if (c < 0 || UTF_8_1_OCTET_P (c))
1239 continue;
1240 ONE_MORE_BYTE (c1);
1241 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1242 break;
1243 if (UTF_8_2_OCTET_LEADING_P (c))
1245 found = CATEGORY_MASK_UTF_8;
1246 continue;
1248 ONE_MORE_BYTE (c2);
1249 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1250 break;
1251 if (UTF_8_3_OCTET_LEADING_P (c))
1253 found = CATEGORY_MASK_UTF_8;
1254 continue;
1256 ONE_MORE_BYTE (c3);
1257 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1258 break;
1259 if (UTF_8_4_OCTET_LEADING_P (c))
1261 found = CATEGORY_MASK_UTF_8;
1262 continue;
1264 ONE_MORE_BYTE (c4);
1265 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1266 break;
1267 if (UTF_8_5_OCTET_LEADING_P (c))
1269 found = CATEGORY_MASK_UTF_8;
1270 continue;
1272 break;
1274 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1275 return 0;
1277 no_more_source:
1278 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1280 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1281 return 0;
1283 detect_info->found |= found;
1284 return 1;
1288 static void
1289 decode_coding_utf_8 (coding)
1290 struct coding_system *coding;
1292 const unsigned char *src = coding->source + coding->consumed;
1293 const unsigned char *src_end = coding->source + coding->src_bytes;
1294 const unsigned char *src_base;
1295 int *charbuf = coding->charbuf + coding->charbuf_used;
1296 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1297 int consumed_chars = 0, consumed_chars_base;
1298 int multibytep = coding->src_multibyte;
1299 Lisp_Object attr, charset_list;
1300 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1301 int byte_after_cr = -1;
1303 CODING_GET_INFO (coding, attr, charset_list);
1305 while (1)
1307 int c, c1, c2, c3, c4, c5;
1309 src_base = src;
1310 consumed_chars_base = consumed_chars;
1312 if (charbuf >= charbuf_end)
1313 break;
1315 if (byte_after_cr >= 0)
1316 c1 = byte_after_cr, byte_after_cr = -1;
1317 else
1318 ONE_MORE_BYTE (c1);
1319 if (c1 < 0)
1321 c = - c1;
1323 else if (UTF_8_1_OCTET_P(c1))
1325 if (eol_crlf && c1 == '\r')
1326 ONE_MORE_BYTE (byte_after_cr);
1327 c = c1;
1329 else
1331 ONE_MORE_BYTE (c2);
1332 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1333 goto invalid_code;
1334 if (UTF_8_2_OCTET_LEADING_P (c1))
1336 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1337 /* Reject overlong sequences here and below. Encoders
1338 producing them are incorrect, they can be misleading,
1339 and they mess up read/write invariance. */
1340 if (c < 128)
1341 goto invalid_code;
1343 else
1345 ONE_MORE_BYTE (c3);
1346 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1347 goto invalid_code;
1348 if (UTF_8_3_OCTET_LEADING_P (c1))
1350 c = (((c1 & 0xF) << 12)
1351 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1352 if (c < 0x800
1353 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1354 goto invalid_code;
1356 else
1358 ONE_MORE_BYTE (c4);
1359 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1360 goto invalid_code;
1361 if (UTF_8_4_OCTET_LEADING_P (c1))
1363 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1364 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1365 if (c < 0x10000)
1366 goto invalid_code;
1368 else
1370 ONE_MORE_BYTE (c5);
1371 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1372 goto invalid_code;
1373 if (UTF_8_5_OCTET_LEADING_P (c1))
1375 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1376 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1377 | (c5 & 0x3F));
1378 if ((c > MAX_CHAR) || (c < 0x200000))
1379 goto invalid_code;
1381 else
1382 goto invalid_code;
1388 *charbuf++ = c;
1389 continue;
1391 invalid_code:
1392 src = src_base;
1393 consumed_chars = consumed_chars_base;
1394 ONE_MORE_BYTE (c);
1395 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1396 coding->errors++;
1399 no_more_source:
1400 coding->consumed_char += consumed_chars_base;
1401 coding->consumed = src_base - coding->source;
1402 coding->charbuf_used = charbuf - coding->charbuf;
1406 static int
1407 encode_coding_utf_8 (coding)
1408 struct coding_system *coding;
1410 int multibytep = coding->dst_multibyte;
1411 int *charbuf = coding->charbuf;
1412 int *charbuf_end = charbuf + coding->charbuf_used;
1413 unsigned char *dst = coding->destination + coding->produced;
1414 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1415 int produced_chars = 0;
1416 int c;
1418 if (multibytep)
1420 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1422 while (charbuf < charbuf_end)
1424 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1426 ASSURE_DESTINATION (safe_room);
1427 c = *charbuf++;
1428 if (CHAR_BYTE8_P (c))
1430 c = CHAR_TO_BYTE8 (c);
1431 EMIT_ONE_BYTE (c);
1433 else
1435 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1436 for (p = str; p < pend; p++)
1437 EMIT_ONE_BYTE (*p);
1441 else
1443 int safe_room = MAX_MULTIBYTE_LENGTH;
1445 while (charbuf < charbuf_end)
1447 ASSURE_DESTINATION (safe_room);
1448 c = *charbuf++;
1449 if (CHAR_BYTE8_P (c))
1450 *dst++ = CHAR_TO_BYTE8 (c);
1451 else
1452 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1453 produced_chars++;
1456 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1457 coding->produced_char += produced_chars;
1458 coding->produced = dst - coding->destination;
1459 return 0;
1463 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1464 Check if a text is encoded in one of UTF-16 based coding systems.
1465 If it is, return 1, else return 0. */
1467 #define UTF_16_HIGH_SURROGATE_P(val) \
1468 (((val) & 0xFC00) == 0xD800)
1470 #define UTF_16_LOW_SURROGATE_P(val) \
1471 (((val) & 0xFC00) == 0xDC00)
1473 #define UTF_16_INVALID_P(val) \
1474 (((val) == 0xFFFE) \
1475 || ((val) == 0xFFFF) \
1476 || UTF_16_LOW_SURROGATE_P (val))
1479 static int
1480 detect_coding_utf_16 (coding, detect_info)
1481 struct coding_system *coding;
1482 struct coding_detection_info *detect_info;
1484 const unsigned char *src = coding->source, *src_base = src;
1485 const unsigned char *src_end = coding->source + coding->src_bytes;
1486 int multibytep = coding->src_multibyte;
1487 int consumed_chars = 0;
1488 int c1, c2;
1490 detect_info->checked |= CATEGORY_MASK_UTF_16;
1491 if (coding->mode & CODING_MODE_LAST_BLOCK
1492 && (coding->src_chars & 1))
1494 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1495 return 0;
1498 ONE_MORE_BYTE (c1);
1499 ONE_MORE_BYTE (c2);
1500 if ((c1 == 0xFF) && (c2 == 0xFE))
1502 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1503 | CATEGORY_MASK_UTF_16_AUTO);
1504 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1505 | CATEGORY_MASK_UTF_16_BE_NOSIG
1506 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1508 else if ((c1 == 0xFE) && (c2 == 0xFF))
1510 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1511 | CATEGORY_MASK_UTF_16_AUTO);
1512 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1513 | CATEGORY_MASK_UTF_16_BE_NOSIG
1514 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1516 else if (c1 >= 0 && c2 >= 0)
1518 detect_info->rejected
1519 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1521 no_more_source:
1522 return 1;
1525 static void
1526 decode_coding_utf_16 (coding)
1527 struct coding_system *coding;
1529 const unsigned char *src = coding->source + coding->consumed;
1530 const unsigned char *src_end = coding->source + coding->src_bytes;
1531 const unsigned char *src_base;
1532 int *charbuf = coding->charbuf + coding->charbuf_used;
1533 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1534 int consumed_chars = 0, consumed_chars_base;
1535 int multibytep = coding->src_multibyte;
1536 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1537 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1538 int surrogate = CODING_UTF_16_SURROGATE (coding);
1539 Lisp_Object attr, charset_list;
1540 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1541 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1543 CODING_GET_INFO (coding, attr, charset_list);
1545 if (bom == utf_16_with_bom)
1547 int c, c1, c2;
1549 src_base = src;
1550 ONE_MORE_BYTE (c1);
1551 ONE_MORE_BYTE (c2);
1552 c = (c1 << 8) | c2;
1554 if (endian == utf_16_big_endian
1555 ? c != 0xFEFF : c != 0xFFFE)
1557 /* The first two bytes are not BOM. Treat them as bytes
1558 for a normal character. */
1559 src = src_base;
1560 coding->errors++;
1562 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1564 else if (bom == utf_16_detect_bom)
1566 /* We have already tried to detect BOM and failed in
1567 detect_coding. */
1568 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1571 while (1)
1573 int c, c1, c2;
1575 src_base = src;
1576 consumed_chars_base = consumed_chars;
1578 if (charbuf + 2 >= charbuf_end)
1579 break;
1581 if (byte_after_cr1 >= 0)
1582 c1 = byte_after_cr1, byte_after_cr1 = -1;
1583 else
1584 ONE_MORE_BYTE (c1);
1585 if (c1 < 0)
1587 *charbuf++ = -c1;
1588 continue;
1590 if (byte_after_cr2 >= 0)
1591 c2 = byte_after_cr2, byte_after_cr2 = -1;
1592 else
1593 ONE_MORE_BYTE (c2);
1594 if (c2 < 0)
1596 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1597 *charbuf++ = -c2;
1598 continue;
1600 c = (endian == utf_16_big_endian
1601 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1603 if (surrogate)
1605 if (! UTF_16_LOW_SURROGATE_P (c))
1607 if (endian == utf_16_big_endian)
1608 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1609 else
1610 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1611 *charbuf++ = c1;
1612 *charbuf++ = c2;
1613 coding->errors++;
1614 if (UTF_16_HIGH_SURROGATE_P (c))
1615 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1616 else
1617 *charbuf++ = c;
1619 else
1621 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1622 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1623 *charbuf++ = 0x10000 + c;
1626 else
1628 if (UTF_16_HIGH_SURROGATE_P (c))
1629 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1630 else
1632 if (eol_crlf && c == '\r')
1634 ONE_MORE_BYTE (byte_after_cr1);
1635 ONE_MORE_BYTE (byte_after_cr2);
1637 *charbuf++ = c;
1642 no_more_source:
1643 coding->consumed_char += consumed_chars_base;
1644 coding->consumed = src_base - coding->source;
1645 coding->charbuf_used = charbuf - coding->charbuf;
1648 static int
1649 encode_coding_utf_16 (coding)
1650 struct coding_system *coding;
1652 int multibytep = coding->dst_multibyte;
1653 int *charbuf = coding->charbuf;
1654 int *charbuf_end = charbuf + coding->charbuf_used;
1655 unsigned char *dst = coding->destination + coding->produced;
1656 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1657 int safe_room = 8;
1658 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1659 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1660 int produced_chars = 0;
1661 Lisp_Object attrs, charset_list;
1662 int c;
1664 CODING_GET_INFO (coding, attrs, charset_list);
1666 if (bom != utf_16_without_bom)
1668 ASSURE_DESTINATION (safe_room);
1669 if (big_endian)
1670 EMIT_TWO_BYTES (0xFE, 0xFF);
1671 else
1672 EMIT_TWO_BYTES (0xFF, 0xFE);
1673 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1676 while (charbuf < charbuf_end)
1678 ASSURE_DESTINATION (safe_room);
1679 c = *charbuf++;
1680 if (c >= MAX_UNICODE_CHAR)
1681 c = coding->default_char;
1683 if (c < 0x10000)
1685 if (big_endian)
1686 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1687 else
1688 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1690 else
1692 int c1, c2;
1694 c -= 0x10000;
1695 c1 = (c >> 10) + 0xD800;
1696 c2 = (c & 0x3FF) + 0xDC00;
1697 if (big_endian)
1698 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1699 else
1700 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1703 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1704 coding->produced = dst - coding->destination;
1705 coding->produced_char += produced_chars;
1706 return 0;
1710 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1712 /* Emacs' internal format for representation of multiple character
1713 sets is a kind of multi-byte encoding, i.e. characters are
1714 represented by variable-length sequences of one-byte codes.
1716 ASCII characters and control characters (e.g. `tab', `newline') are
1717 represented by one-byte sequences which are their ASCII codes, in
1718 the range 0x00 through 0x7F.
1720 8-bit characters of the range 0x80..0x9F are represented by
1721 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1722 code + 0x20).
1724 8-bit characters of the range 0xA0..0xFF are represented by
1725 one-byte sequences which are their 8-bit code.
1727 The other characters are represented by a sequence of `base
1728 leading-code', optional `extended leading-code', and one or two
1729 `position-code's. The length of the sequence is determined by the
1730 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1731 whereas extended leading-code and position-code take the range 0xA0
1732 through 0xFF. See `charset.h' for more details about leading-code
1733 and position-code.
1735 --- CODE RANGE of Emacs' internal format ---
1736 character set range
1737 ------------- -----
1738 ascii 0x00..0x7F
1739 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1740 eight-bit-graphic 0xA0..0xBF
1741 ELSE 0x81..0x9D + [0xA0..0xFF]+
1742 ---------------------------------------------
1744 As this is the internal character representation, the format is
1745 usually not used externally (i.e. in a file or in a data sent to a
1746 process). But, it is possible to have a text externally in this
1747 format (i.e. by encoding by the coding system `emacs-mule').
1749 In that case, a sequence of one-byte codes has a slightly different
1750 form.
1752 At first, all characters in eight-bit-control are represented by
1753 one-byte sequences which are their 8-bit code.
1755 Next, character composition data are represented by the byte
1756 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1757 where,
1758 METHOD is 0xF0 plus one of composition method (enum
1759 composition_method),
1761 BYTES is 0xA0 plus a byte length of this composition data,
1763 CHARS is 0x20 plus a number of characters composed by this
1764 data,
1766 COMPONENTs are characters of multibye form or composition
1767 rules encoded by two-byte of ASCII codes.
1769 In addition, for backward compatibility, the following formats are
1770 also recognized as composition data on decoding.
1772 0x80 MSEQ ...
1773 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1775 Here,
1776 MSEQ is a multibyte form but in these special format:
1777 ASCII: 0xA0 ASCII_CODE+0x80,
1778 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1779 RULE is a one byte code of the range 0xA0..0xF0 that
1780 represents a composition rule.
1783 char emacs_mule_bytes[256];
1786 emacs_mule_char (coding, src, nbytes, nchars, id)
1787 struct coding_system *coding;
1788 const unsigned char *src;
1789 int *nbytes, *nchars, *id;
1791 const unsigned char *src_end = coding->source + coding->src_bytes;
1792 const unsigned char *src_base = src;
1793 int multibytep = coding->src_multibyte;
1794 struct charset *charset;
1795 unsigned code;
1796 int c;
1797 int consumed_chars = 0;
1799 ONE_MORE_BYTE (c);
1800 if (c < 0)
1802 c = -c;
1803 charset = emacs_mule_charset[0];
1805 else
1807 if (c >= 0xA0)
1809 /* Old style component character of a composition. */
1810 if (c == 0xA0)
1812 ONE_MORE_BYTE (c);
1813 c -= 0x80;
1815 else
1816 c -= 0x20;
1819 switch (emacs_mule_bytes[c])
1821 case 2:
1822 if (! (charset = emacs_mule_charset[c]))
1823 goto invalid_code;
1824 ONE_MORE_BYTE (c);
1825 if (c < 0xA0)
1826 goto invalid_code;
1827 code = c & 0x7F;
1828 break;
1830 case 3:
1831 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1832 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1834 ONE_MORE_BYTE (c);
1835 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1836 goto invalid_code;
1837 ONE_MORE_BYTE (c);
1838 if (c < 0xA0)
1839 goto invalid_code;
1840 code = c & 0x7F;
1842 else
1844 if (! (charset = emacs_mule_charset[c]))
1845 goto invalid_code;
1846 ONE_MORE_BYTE (c);
1847 if (c < 0xA0)
1848 goto invalid_code;
1849 code = (c & 0x7F) << 8;
1850 ONE_MORE_BYTE (c);
1851 if (c < 0xA0)
1852 goto invalid_code;
1853 code |= c & 0x7F;
1855 break;
1857 case 4:
1858 ONE_MORE_BYTE (c);
1859 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1860 goto invalid_code;
1861 ONE_MORE_BYTE (c);
1862 if (c < 0xA0)
1863 goto invalid_code;
1864 code = (c & 0x7F) << 8;
1865 ONE_MORE_BYTE (c);
1866 if (c < 0xA0)
1867 goto invalid_code;
1868 code |= c & 0x7F;
1869 break;
1871 case 1:
1872 code = c;
1873 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1874 ? charset_ascii : charset_eight_bit);
1875 break;
1877 default:
1878 abort ();
1880 c = DECODE_CHAR (charset, code);
1881 if (c < 0)
1882 goto invalid_code;
1884 *nbytes = src - src_base;
1885 *nchars = consumed_chars;
1886 if (id)
1887 *id = charset->id;
1888 return c;
1890 no_more_source:
1891 return -2;
1893 invalid_code:
1894 return -1;
1898 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1899 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1900 else return 0. */
1902 static int
1903 detect_coding_emacs_mule (coding, detect_info)
1904 struct coding_system *coding;
1905 struct coding_detection_info *detect_info;
1907 const unsigned char *src = coding->source, *src_base;
1908 const unsigned char *src_end = coding->source + coding->src_bytes;
1909 int multibytep = coding->src_multibyte;
1910 int consumed_chars = 0;
1911 int c;
1912 int found = 0;
1914 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1915 /* A coding system of this category is always ASCII compatible. */
1916 src += coding->head_ascii;
1918 while (1)
1920 src_base = src;
1921 ONE_MORE_BYTE (c);
1922 if (c < 0)
1923 continue;
1924 if (c == 0x80)
1926 /* Perhaps the start of composite character. We simple skip
1927 it because analyzing it is too heavy for detecting. But,
1928 at least, we check that the composite character
1929 constitues of more than 4 bytes. */
1930 const unsigned char *src_base;
1932 repeat:
1933 src_base = src;
1936 ONE_MORE_BYTE (c);
1938 while (c >= 0xA0);
1940 if (src - src_base <= 4)
1941 break;
1942 found = CATEGORY_MASK_EMACS_MULE;
1943 if (c == 0x80)
1944 goto repeat;
1947 if (c < 0x80)
1949 if (c < 0x20
1950 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1951 break;
1953 else
1955 int more_bytes = emacs_mule_bytes[*src_base] - 1;
1957 while (more_bytes > 0)
1959 ONE_MORE_BYTE (c);
1960 if (c < 0xA0)
1962 src--; /* Unread the last byte. */
1963 break;
1965 more_bytes--;
1967 if (more_bytes != 0)
1968 break;
1969 found = CATEGORY_MASK_EMACS_MULE;
1972 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1973 return 0;
1975 no_more_source:
1976 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1978 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1979 return 0;
1981 detect_info->found |= found;
1982 return 1;
1986 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1988 /* Decode a character represented as a component of composition
1989 sequence of Emacs 20/21 style at SRC. Set C to that character and
1990 update SRC to the head of next character (or an encoded composition
1991 rule). If SRC doesn't points a composition component, set C to -1.
1992 If SRC points an invalid byte sequence, global exit by a return
1993 value 0. */
1995 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1996 do \
1998 int c; \
1999 int nbytes, nchars; \
2001 if (src == src_end) \
2002 break; \
2003 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
2004 if (c < 0) \
2006 if (c == -2) \
2007 break; \
2008 goto invalid_code; \
2010 *buf++ = c; \
2011 src += nbytes; \
2012 consumed_chars += nchars; \
2014 while (0)
2017 /* Decode a composition rule represented as a component of composition
2018 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
2019 and increment BUF. If SRC points an invalid byte sequence, set C
2020 to -1. */
2022 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
2023 do { \
2024 int c, gref, nref; \
2026 if (src >= src_end) \
2027 goto invalid_code; \
2028 ONE_MORE_BYTE_NO_CHECK (c); \
2029 c -= 0xA0; \
2030 if (c < 0 || c >= 81) \
2031 goto invalid_code; \
2033 gref = c / 9, nref = c % 9; \
2034 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
2035 } while (0)
2038 /* Decode a composition rule represented as a component of composition
2039 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
2040 and increment BUF. If SRC points an invalid byte sequence, set C
2041 to -1. */
2043 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
2044 do { \
2045 int gref, nref; \
2047 if (src + 1>= src_end) \
2048 goto invalid_code; \
2049 ONE_MORE_BYTE_NO_CHECK (gref); \
2050 gref -= 0x20; \
2051 ONE_MORE_BYTE_NO_CHECK (nref); \
2052 nref -= 0x20; \
2053 if (gref < 0 || gref >= 81 \
2054 || nref < 0 || nref >= 81) \
2055 goto invalid_code; \
2056 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
2057 } while (0)
2060 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
2061 do { \
2062 /* Emacs 21 style format. The first three bytes at SRC are \
2063 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
2064 the byte length of this composition information, CHARS is the \
2065 number of characters composed by this composition. */ \
2066 enum composition_method method = c - 0xF2; \
2067 int *charbuf_base = charbuf; \
2068 int consumed_chars_limit; \
2069 int nbytes, nchars; \
2071 ONE_MORE_BYTE (c); \
2072 if (c < 0) \
2073 goto invalid_code; \
2074 nbytes = c - 0xA0; \
2075 if (nbytes < 3) \
2076 goto invalid_code; \
2077 ONE_MORE_BYTE (c); \
2078 if (c < 0) \
2079 goto invalid_code; \
2080 nchars = c - 0xA0; \
2081 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
2082 consumed_chars_limit = consumed_chars_base + nbytes; \
2083 if (method != COMPOSITION_RELATIVE) \
2085 int i = 0; \
2086 while (consumed_chars < consumed_chars_limit) \
2088 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
2089 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
2090 else \
2091 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
2092 i++; \
2094 if (consumed_chars < consumed_chars_limit) \
2095 goto invalid_code; \
2096 charbuf_base[0] -= i; \
2098 } while (0)
2101 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
2102 do { \
2103 /* Emacs 20 style format for relative composition. */ \
2104 /* Store multibyte form of characters to be composed. */ \
2105 enum composition_method method = COMPOSITION_RELATIVE; \
2106 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2107 int *buf = components; \
2108 int i, j; \
2110 src = src_base; \
2111 ONE_MORE_BYTE (c); /* skip 0x80 */ \
2112 for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++) \
2113 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2114 if (i < 2) \
2115 goto invalid_code; \
2116 ADD_COMPOSITION_DATA (charbuf, i, method); \
2117 for (j = 0; j < i; j++) \
2118 *charbuf++ = components[j]; \
2119 } while (0)
2122 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
2123 do { \
2124 /* Emacs 20 style format for rule-base composition. */ \
2125 /* Store multibyte form of characters to be composed. */ \
2126 enum composition_method method = COMPOSITION_WITH_RULE; \
2127 int *charbuf_base = charbuf; \
2128 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2129 int *buf = components; \
2130 int i, j; \
2132 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2133 for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++) \
2135 if (*src < 0xA0) \
2136 break; \
2137 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
2138 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2140 if (i <= 1 || (buf - components) % 2 == 0) \
2141 goto invalid_code; \
2142 if (charbuf + i + (i / 2) + 1 >= charbuf_end) \
2143 goto no_more_source; \
2144 ADD_COMPOSITION_DATA (charbuf, i, method); \
2145 i = i * 2 - 1; \
2146 for (j = 0; j < i; j++) \
2147 *charbuf++ = components[j]; \
2148 charbuf_base[0] -= i; \
2149 for (j = 0; j < i; j += 2) \
2150 *charbuf++ = components[j]; \
2151 } while (0)
2154 static void
2155 decode_coding_emacs_mule (coding)
2156 struct coding_system *coding;
2158 const unsigned char *src = coding->source + coding->consumed;
2159 const unsigned char *src_end = coding->source + coding->src_bytes;
2160 const unsigned char *src_base;
2161 int *charbuf = coding->charbuf + coding->charbuf_used;
2162 int *charbuf_end
2163 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2164 int consumed_chars = 0, consumed_chars_base;
2165 int multibytep = coding->src_multibyte;
2166 Lisp_Object attrs, charset_list;
2167 int char_offset = coding->produced_char;
2168 int last_offset = char_offset;
2169 int last_id = charset_ascii;
2170 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2171 int byte_after_cr = -1;
2173 CODING_GET_INFO (coding, attrs, charset_list);
2175 while (1)
2177 int c;
2179 src_base = src;
2180 consumed_chars_base = consumed_chars;
2182 if (charbuf >= charbuf_end)
2183 break;
2185 if (byte_after_cr >= 0)
2186 c = byte_after_cr, byte_after_cr = -1;
2187 else
2188 ONE_MORE_BYTE (c);
2189 if (c < 0)
2191 *charbuf++ = -c;
2192 char_offset++;
2194 else if (c < 0x80)
2196 if (eol_crlf && c == '\r')
2197 ONE_MORE_BYTE (byte_after_cr);
2198 *charbuf++ = c;
2199 char_offset++;
2201 else if (c == 0x80)
2203 ONE_MORE_BYTE (c);
2204 if (c < 0)
2205 goto invalid_code;
2206 if (c - 0xF2 >= COMPOSITION_RELATIVE
2207 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2208 DECODE_EMACS_MULE_21_COMPOSITION (c);
2209 else if (c < 0xC0)
2210 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2211 else if (c == 0xFF)
2212 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2213 else
2214 goto invalid_code;
2216 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2218 int nbytes, nchars;
2219 int id;
2221 src = src_base;
2222 consumed_chars = consumed_chars_base;
2223 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2224 if (c < 0)
2226 if (c == -2)
2227 break;
2228 goto invalid_code;
2230 if (last_id != id)
2232 if (last_id != charset_ascii)
2233 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2234 last_id = id;
2235 last_offset = char_offset;
2237 *charbuf++ = c;
2238 src += nbytes;
2239 consumed_chars += nchars;
2240 char_offset++;
2242 else
2243 goto invalid_code;
2244 continue;
2246 invalid_code:
2247 src = src_base;
2248 consumed_chars = consumed_chars_base;
2249 ONE_MORE_BYTE (c);
2250 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2251 char_offset++;
2252 coding->errors++;
2255 no_more_source:
2256 if (last_id != charset_ascii)
2257 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2258 coding->consumed_char += consumed_chars_base;
2259 coding->consumed = src_base - coding->source;
2260 coding->charbuf_used = charbuf - coding->charbuf;
2264 #define EMACS_MULE_LEADING_CODES(id, codes) \
2265 do { \
2266 if (id < 0xA0) \
2267 codes[0] = id, codes[1] = 0; \
2268 else if (id < 0xE0) \
2269 codes[0] = 0x9A, codes[1] = id; \
2270 else if (id < 0xF0) \
2271 codes[0] = 0x9B, codes[1] = id; \
2272 else if (id < 0xF5) \
2273 codes[0] = 0x9C, codes[1] = id; \
2274 else \
2275 codes[0] = 0x9D, codes[1] = id; \
2276 } while (0);
2279 static int
2280 encode_coding_emacs_mule (coding)
2281 struct coding_system *coding;
2283 int multibytep = coding->dst_multibyte;
2284 int *charbuf = coding->charbuf;
2285 int *charbuf_end = charbuf + coding->charbuf_used;
2286 unsigned char *dst = coding->destination + coding->produced;
2287 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2288 int safe_room = 8;
2289 int produced_chars = 0;
2290 Lisp_Object attrs, charset_list;
2291 int c;
2292 int preferred_charset_id = -1;
2294 CODING_GET_INFO (coding, attrs, charset_list);
2295 if (! EQ (charset_list, Vemacs_mule_charset_list))
2297 CODING_ATTR_CHARSET_LIST (attrs)
2298 = charset_list = Vemacs_mule_charset_list;
2301 while (charbuf < charbuf_end)
2303 ASSURE_DESTINATION (safe_room);
2304 c = *charbuf++;
2306 if (c < 0)
2308 /* Handle an annotation. */
2309 switch (*charbuf)
2311 case CODING_ANNOTATE_COMPOSITION_MASK:
2312 /* Not yet implemented. */
2313 break;
2314 case CODING_ANNOTATE_CHARSET_MASK:
2315 preferred_charset_id = charbuf[3];
2316 if (preferred_charset_id >= 0
2317 && NILP (Fmemq (make_number (preferred_charset_id),
2318 charset_list)))
2319 preferred_charset_id = -1;
2320 break;
2321 default:
2322 abort ();
2324 charbuf += -c - 1;
2325 continue;
2328 if (ASCII_CHAR_P (c))
2329 EMIT_ONE_ASCII_BYTE (c);
2330 else if (CHAR_BYTE8_P (c))
2332 c = CHAR_TO_BYTE8 (c);
2333 EMIT_ONE_BYTE (c);
2335 else
2337 struct charset *charset;
2338 unsigned code;
2339 int dimension;
2340 int emacs_mule_id;
2341 unsigned char leading_codes[2];
2343 if (preferred_charset_id >= 0)
2345 charset = CHARSET_FROM_ID (preferred_charset_id);
2346 if (! CHAR_CHARSET_P (c, charset))
2347 charset = char_charset (c, charset_list, NULL);
2349 else
2350 charset = char_charset (c, charset_list, &code);
2351 if (! charset)
2353 c = coding->default_char;
2354 if (ASCII_CHAR_P (c))
2356 EMIT_ONE_ASCII_BYTE (c);
2357 continue;
2359 charset = char_charset (c, charset_list, &code);
2361 dimension = CHARSET_DIMENSION (charset);
2362 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2363 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2364 EMIT_ONE_BYTE (leading_codes[0]);
2365 if (leading_codes[1])
2366 EMIT_ONE_BYTE (leading_codes[1]);
2367 if (dimension == 1)
2368 EMIT_ONE_BYTE (code | 0x80);
2369 else
2371 code |= 0x8080;
2372 EMIT_ONE_BYTE (code >> 8);
2373 EMIT_ONE_BYTE (code & 0xFF);
2377 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2378 coding->produced_char += produced_chars;
2379 coding->produced = dst - coding->destination;
2380 return 0;
2384 /*** 7. ISO2022 handlers ***/
2386 /* The following note describes the coding system ISO2022 briefly.
2387 Since the intention of this note is to help understand the
2388 functions in this file, some parts are NOT ACCURATE or are OVERLY
2389 SIMPLIFIED. For thorough understanding, please refer to the
2390 original document of ISO2022. This is equivalent to the standard
2391 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2393 ISO2022 provides many mechanisms to encode several character sets
2394 in 7-bit and 8-bit environments. For 7-bit environments, all text
2395 is encoded using bytes less than 128. This may make the encoded
2396 text a little bit longer, but the text passes more easily through
2397 several types of gateway, some of which strip off the MSB (Most
2398 Significant Bit).
2400 There are two kinds of character sets: control character sets and
2401 graphic character sets. The former contain control characters such
2402 as `newline' and `escape' to provide control functions (control
2403 functions are also provided by escape sequences). The latter
2404 contain graphic characters such as 'A' and '-'. Emacs recognizes
2405 two control character sets and many graphic character sets.
2407 Graphic character sets are classified into one of the following
2408 four classes, according to the number of bytes (DIMENSION) and
2409 number of characters in one dimension (CHARS) of the set:
2410 - DIMENSION1_CHARS94
2411 - DIMENSION1_CHARS96
2412 - DIMENSION2_CHARS94
2413 - DIMENSION2_CHARS96
2415 In addition, each character set is assigned an identification tag,
2416 unique for each set, called the "final character" (denoted as <F>
2417 hereafter). The <F> of each character set is decided by ECMA(*)
2418 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2419 (0x30..0x3F are for private use only).
2421 Note (*): ECMA = European Computer Manufacturers Association
2423 Here are examples of graphic character sets [NAME(<F>)]:
2424 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2425 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2426 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2427 o DIMENSION2_CHARS96 -- none for the moment
2429 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2430 C0 [0x00..0x1F] -- control character plane 0
2431 GL [0x20..0x7F] -- graphic character plane 0
2432 C1 [0x80..0x9F] -- control character plane 1
2433 GR [0xA0..0xFF] -- graphic character plane 1
2435 A control character set is directly designated and invoked to C0 or
2436 C1 by an escape sequence. The most common case is that:
2437 - ISO646's control character set is designated/invoked to C0, and
2438 - ISO6429's control character set is designated/invoked to C1,
2439 and usually these designations/invocations are omitted in encoded
2440 text. In a 7-bit environment, only C0 can be used, and a control
2441 character for C1 is encoded by an appropriate escape sequence to
2442 fit into the environment. All control characters for C1 are
2443 defined to have corresponding escape sequences.
2445 A graphic character set is at first designated to one of four
2446 graphic registers (G0 through G3), then these graphic registers are
2447 invoked to GL or GR. These designations and invocations can be
2448 done independently. The most common case is that G0 is invoked to
2449 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2450 these invocations and designations are omitted in encoded text.
2451 In a 7-bit environment, only GL can be used.
2453 When a graphic character set of CHARS94 is invoked to GL, codes
2454 0x20 and 0x7F of the GL area work as control characters SPACE and
2455 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2456 be used.
2458 There are two ways of invocation: locking-shift and single-shift.
2459 With locking-shift, the invocation lasts until the next different
2460 invocation, whereas with single-shift, the invocation affects the
2461 following character only and doesn't affect the locking-shift
2462 state. Invocations are done by the following control characters or
2463 escape sequences:
2465 ----------------------------------------------------------------------
2466 abbrev function cntrl escape seq description
2467 ----------------------------------------------------------------------
2468 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2469 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2470 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2471 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2472 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2473 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2474 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2475 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2476 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2477 ----------------------------------------------------------------------
2478 (*) These are not used by any known coding system.
2480 Control characters for these functions are defined by macros
2481 ISO_CODE_XXX in `coding.h'.
2483 Designations are done by the following escape sequences:
2484 ----------------------------------------------------------------------
2485 escape sequence description
2486 ----------------------------------------------------------------------
2487 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2488 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2489 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2490 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2491 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2492 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2493 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2494 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2495 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2496 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2497 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2498 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2499 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2500 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2501 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2502 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2503 ----------------------------------------------------------------------
2505 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2506 of dimension 1, chars 94, and final character <F>, etc...
2508 Note (*): Although these designations are not allowed in ISO2022,
2509 Emacs accepts them on decoding, and produces them on encoding
2510 CHARS96 character sets in a coding system which is characterized as
2511 7-bit environment, non-locking-shift, and non-single-shift.
2513 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2514 '(' must be omitted. We refer to this as "short-form" hereafter.
2516 Now you may notice that there are a lot of ways of encoding the
2517 same multilingual text in ISO2022. Actually, there exist many
2518 coding systems such as Compound Text (used in X11's inter client
2519 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2520 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2521 localized platforms), and all of these are variants of ISO2022.
2523 In addition to the above, Emacs handles two more kinds of escape
2524 sequences: ISO6429's direction specification and Emacs' private
2525 sequence for specifying character composition.
2527 ISO6429's direction specification takes the following form:
2528 o CSI ']' -- end of the current direction
2529 o CSI '0' ']' -- end of the current direction
2530 o CSI '1' ']' -- start of left-to-right text
2531 o CSI '2' ']' -- start of right-to-left text
2532 The control character CSI (0x9B: control sequence introducer) is
2533 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2535 Character composition specification takes the following form:
2536 o ESC '0' -- start relative composition
2537 o ESC '1' -- end composition
2538 o ESC '2' -- start rule-base composition (*)
2539 o ESC '3' -- start relative composition with alternate chars (**)
2540 o ESC '4' -- start rule-base composition with alternate chars (**)
2541 Since these are not standard escape sequences of any ISO standard,
2542 the use of them with these meanings is restricted to Emacs only.
2544 (*) This form is used only in Emacs 20.7 and older versions,
2545 but newer versions can safely decode it.
2546 (**) This form is used only in Emacs 21.1 and newer versions,
2547 and older versions can't decode it.
2549 Here's a list of example usages of these composition escape
2550 sequences (categorized by `enum composition_method').
2552 COMPOSITION_RELATIVE:
2553 ESC 0 CHAR [ CHAR ] ESC 1
2554 COMPOSITION_WITH_RULE:
2555 ESC 2 CHAR [ RULE CHAR ] ESC 1
2556 COMPOSITION_WITH_ALTCHARS:
2557 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2558 COMPOSITION_WITH_RULE_ALTCHARS:
2559 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2561 enum iso_code_class_type iso_code_class[256];
2563 #define SAFE_CHARSET_P(coding, id) \
2564 ((id) <= (coding)->max_charset_id \
2565 && (coding)->safe_charsets[id] >= 0)
2568 #define SHIFT_OUT_OK(category) \
2569 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2571 static void
2572 setup_iso_safe_charsets (attrs)
2573 Lisp_Object attrs;
2575 Lisp_Object charset_list, safe_charsets;
2576 Lisp_Object request;
2577 Lisp_Object reg_usage;
2578 Lisp_Object tail;
2579 int reg94, reg96;
2580 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2581 int max_charset_id;
2583 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2584 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2585 && ! EQ (charset_list, Viso_2022_charset_list))
2587 CODING_ATTR_CHARSET_LIST (attrs)
2588 = charset_list = Viso_2022_charset_list;
2589 ASET (attrs, coding_attr_safe_charsets, Qnil);
2592 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2593 return;
2595 max_charset_id = 0;
2596 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2598 int id = XINT (XCAR (tail));
2599 if (max_charset_id < id)
2600 max_charset_id = id;
2603 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2604 make_number (255));
2605 request = AREF (attrs, coding_attr_iso_request);
2606 reg_usage = AREF (attrs, coding_attr_iso_usage);
2607 reg94 = XINT (XCAR (reg_usage));
2608 reg96 = XINT (XCDR (reg_usage));
2610 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2612 Lisp_Object id;
2613 Lisp_Object reg;
2614 struct charset *charset;
2616 id = XCAR (tail);
2617 charset = CHARSET_FROM_ID (XINT (id));
2618 reg = Fcdr (Fassq (id, request));
2619 if (! NILP (reg))
2620 SSET (safe_charsets, XINT (id), XINT (reg));
2621 else if (charset->iso_chars_96)
2623 if (reg96 < 4)
2624 SSET (safe_charsets, XINT (id), reg96);
2626 else
2628 if (reg94 < 4)
2629 SSET (safe_charsets, XINT (id), reg94);
2632 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2636 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2637 Check if a text is encoded in one of ISO-2022 based codig systems.
2638 If it is, return 1, else return 0. */
2640 static int
2641 detect_coding_iso_2022 (coding, detect_info)
2642 struct coding_system *coding;
2643 struct coding_detection_info *detect_info;
2645 const unsigned char *src = coding->source, *src_base = src;
2646 const unsigned char *src_end = coding->source + coding->src_bytes;
2647 int multibytep = coding->src_multibyte;
2648 int single_shifting = 0;
2649 int id;
2650 int c, c1;
2651 int consumed_chars = 0;
2652 int i;
2653 int rejected = 0;
2654 int found = 0;
2656 detect_info->checked |= CATEGORY_MASK_ISO;
2658 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2660 struct coding_system *this = &(coding_categories[i]);
2661 Lisp_Object attrs, val;
2663 attrs = CODING_ID_ATTRS (this->id);
2664 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2665 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2666 setup_iso_safe_charsets (attrs);
2667 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2668 this->max_charset_id = SCHARS (val) - 1;
2669 this->safe_charsets = (char *) SDATA (val);
2672 /* A coding system of this category is always ASCII compatible. */
2673 src += coding->head_ascii;
2675 while (rejected != CATEGORY_MASK_ISO)
2677 src_base = src;
2678 ONE_MORE_BYTE (c);
2679 switch (c)
2681 case ISO_CODE_ESC:
2682 if (inhibit_iso_escape_detection)
2683 break;
2684 single_shifting = 0;
2685 ONE_MORE_BYTE (c);
2686 if (c >= '(' && c <= '/')
2688 /* Designation sequence for a charset of dimension 1. */
2689 ONE_MORE_BYTE (c1);
2690 if (c1 < ' ' || c1 >= 0x80
2691 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2692 /* Invalid designation sequence. Just ignore. */
2693 break;
2695 else if (c == '$')
2697 /* Designation sequence for a charset of dimension 2. */
2698 ONE_MORE_BYTE (c);
2699 if (c >= '@' && c <= 'B')
2700 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2701 id = iso_charset_table[1][0][c];
2702 else if (c >= '(' && c <= '/')
2704 ONE_MORE_BYTE (c1);
2705 if (c1 < ' ' || c1 >= 0x80
2706 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2707 /* Invalid designation sequence. Just ignore. */
2708 break;
2710 else
2711 /* Invalid designation sequence. Just ignore it. */
2712 break;
2714 else if (c == 'N' || c == 'O')
2716 /* ESC <Fe> for SS2 or SS3. */
2717 single_shifting = 1;
2718 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2719 break;
2721 else if (c >= '0' && c <= '4')
2723 /* ESC <Fp> for start/end composition. */
2724 found |= CATEGORY_MASK_ISO;
2725 break;
2727 else
2729 /* Invalid escape sequence. Just ignore it. */
2730 break;
2733 /* We found a valid designation sequence for CHARSET. */
2734 rejected |= CATEGORY_MASK_ISO_8BIT;
2735 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2736 id))
2737 found |= CATEGORY_MASK_ISO_7;
2738 else
2739 rejected |= CATEGORY_MASK_ISO_7;
2740 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2741 id))
2742 found |= CATEGORY_MASK_ISO_7_TIGHT;
2743 else
2744 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2745 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2746 id))
2747 found |= CATEGORY_MASK_ISO_7_ELSE;
2748 else
2749 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2750 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2751 id))
2752 found |= CATEGORY_MASK_ISO_8_ELSE;
2753 else
2754 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2755 break;
2757 case ISO_CODE_SO:
2758 case ISO_CODE_SI:
2759 /* Locking shift out/in. */
2760 if (inhibit_iso_escape_detection)
2761 break;
2762 single_shifting = 0;
2763 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2764 break;
2766 case ISO_CODE_CSI:
2767 /* Control sequence introducer. */
2768 single_shifting = 0;
2769 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2770 found |= CATEGORY_MASK_ISO_8_ELSE;
2771 goto check_extra_latin;
2773 case ISO_CODE_SS2:
2774 case ISO_CODE_SS3:
2775 /* Single shift. */
2776 if (inhibit_iso_escape_detection)
2777 break;
2778 single_shifting = 0;
2779 rejected |= CATEGORY_MASK_ISO_7BIT;
2780 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2781 & CODING_ISO_FLAG_SINGLE_SHIFT)
2782 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2783 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2784 & CODING_ISO_FLAG_SINGLE_SHIFT)
2785 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2786 if (single_shifting)
2787 break;
2788 goto check_extra_latin;
2790 default:
2791 if (c < 0)
2792 continue;
2793 if (c < 0x80)
2795 single_shifting = 0;
2796 break;
2798 if (c >= 0xA0)
2800 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2801 found |= CATEGORY_MASK_ISO_8_1;
2802 /* Check the length of succeeding codes of the range
2803 0xA0..0FF. If the byte length is even, we include
2804 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2805 only when we are not single shifting. */
2806 if (! single_shifting
2807 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2809 int i = 1;
2810 while (src < src_end)
2812 ONE_MORE_BYTE (c);
2813 if (c < 0xA0)
2814 break;
2815 i++;
2818 if (i & 1 && src < src_end)
2819 rejected |= CATEGORY_MASK_ISO_8_2;
2820 else
2821 found |= CATEGORY_MASK_ISO_8_2;
2823 break;
2825 check_extra_latin:
2826 single_shifting = 0;
2827 if (! VECTORP (Vlatin_extra_code_table)
2828 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2830 rejected = CATEGORY_MASK_ISO;
2831 break;
2833 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2834 & CODING_ISO_FLAG_LATIN_EXTRA)
2835 found |= CATEGORY_MASK_ISO_8_1;
2836 else
2837 rejected |= CATEGORY_MASK_ISO_8_1;
2838 rejected |= CATEGORY_MASK_ISO_8_2;
2841 detect_info->rejected |= CATEGORY_MASK_ISO;
2842 return 0;
2844 no_more_source:
2845 detect_info->rejected |= rejected;
2846 detect_info->found |= (found & ~rejected);
2847 return 1;
2851 /* Set designation state into CODING. Set CHARS_96 to -1 if the
2852 escape sequence should be kept. */
2853 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2854 do { \
2855 int id, prev; \
2857 if (final < '0' || final >= 128 \
2858 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2859 || !SAFE_CHARSET_P (coding, id)) \
2861 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2862 chars_96 = -1; \
2863 break; \
2865 prev = CODING_ISO_DESIGNATION (coding, reg); \
2866 if (id == charset_jisx0201_roman) \
2868 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2869 id = charset_ascii; \
2871 else if (id == charset_jisx0208_1978) \
2873 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2874 id = charset_jisx0208; \
2876 CODING_ISO_DESIGNATION (coding, reg) = id; \
2877 /* If there was an invalid designation to REG previously, and this \
2878 designation is ASCII to REG, we should keep this designation \
2879 sequence. */ \
2880 if (prev == -2 && id == charset_ascii) \
2881 chars_96 = -1; \
2882 } while (0)
2885 #define MAYBE_FINISH_COMPOSITION() \
2886 do { \
2887 int i; \
2888 if (composition_state == COMPOSING_NO) \
2889 break; \
2890 /* It is assured that we have enough room for producing \
2891 characters stored in the table `components'. */ \
2892 if (charbuf + component_idx > charbuf_end) \
2893 goto no_more_source; \
2894 composition_state = COMPOSING_NO; \
2895 if (method == COMPOSITION_RELATIVE \
2896 || method == COMPOSITION_WITH_ALTCHARS) \
2898 for (i = 0; i < component_idx; i++) \
2899 *charbuf++ = components[i]; \
2900 char_offset += component_idx; \
2902 else \
2904 for (i = 0; i < component_idx; i += 2) \
2905 *charbuf++ = components[i]; \
2906 char_offset += (component_idx / 2) + 1; \
2908 } while (0)
2911 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2912 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2913 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2914 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2915 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2918 #define DECODE_COMPOSITION_START(c1) \
2919 do { \
2920 if (c1 == '0' \
2921 && composition_state == COMPOSING_COMPONENT_RULE) \
2923 component_len = component_idx; \
2924 composition_state = COMPOSING_CHAR; \
2926 else \
2928 const unsigned char *p; \
2930 MAYBE_FINISH_COMPOSITION (); \
2931 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2932 goto no_more_source; \
2933 for (p = src; p < src_end - 1; p++) \
2934 if (*p == ISO_CODE_ESC && p[1] == '1') \
2935 break; \
2936 if (p == src_end - 1) \
2938 /* The current composition doesn't end in the current \
2939 source. */ \
2940 record_conversion_result \
2941 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
2942 goto no_more_source; \
2945 /* This is surely the start of a composition. */ \
2946 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2947 : c1 == '2' ? COMPOSITION_WITH_RULE \
2948 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2949 : COMPOSITION_WITH_RULE_ALTCHARS); \
2950 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2951 : COMPOSING_COMPONENT_CHAR); \
2952 component_idx = component_len = 0; \
2954 } while (0)
2957 /* Handle compositoin end sequence ESC 1. */
2959 #define DECODE_COMPOSITION_END() \
2960 do { \
2961 int nchars = (component_len > 0 ? component_idx - component_len \
2962 : method == COMPOSITION_RELATIVE ? component_idx \
2963 : (component_idx + 1) / 2); \
2964 int i; \
2965 int *saved_charbuf = charbuf; \
2967 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
2968 if (method != COMPOSITION_RELATIVE) \
2970 if (component_len == 0) \
2971 for (i = 0; i < component_idx; i++) \
2972 *charbuf++ = components[i]; \
2973 else \
2974 for (i = 0; i < component_len; i++) \
2975 *charbuf++ = components[i]; \
2976 *saved_charbuf = saved_charbuf - charbuf; \
2978 if (method == COMPOSITION_WITH_RULE) \
2979 for (i = 0; i < component_idx; i += 2, char_offset++) \
2980 *charbuf++ = components[i]; \
2981 else \
2982 for (i = component_len; i < component_idx; i++, char_offset++) \
2983 *charbuf++ = components[i]; \
2984 coding->annotated = 1; \
2985 composition_state = COMPOSING_NO; \
2986 } while (0)
2989 /* Decode a composition rule from the byte C1 (and maybe one more byte
2990 from SRC) and store one encoded composition rule in
2991 coding->cmp_data. */
2993 #define DECODE_COMPOSITION_RULE(c1) \
2994 do { \
2995 (c1) -= 32; \
2996 if (c1 < 81) /* old format (before ver.21) */ \
2998 int gref = (c1) / 9; \
2999 int nref = (c1) % 9; \
3000 if (gref == 4) gref = 10; \
3001 if (nref == 4) nref = 10; \
3002 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
3004 else if (c1 < 93) /* new format (after ver.21) */ \
3006 ONE_MORE_BYTE (c2); \
3007 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
3009 else \
3010 c1 = 0; \
3011 } while (0)
3014 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3016 static void
3017 decode_coding_iso_2022 (coding)
3018 struct coding_system *coding;
3020 const unsigned char *src = coding->source + coding->consumed;
3021 const unsigned char *src_end = coding->source + coding->src_bytes;
3022 const unsigned char *src_base;
3023 int *charbuf = coding->charbuf + coding->charbuf_used;
3024 int *charbuf_end
3025 = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
3026 int consumed_chars = 0, consumed_chars_base;
3027 int multibytep = coding->src_multibyte;
3028 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3029 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3030 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3031 int charset_id_2, charset_id_3;
3032 struct charset *charset;
3033 int c;
3034 /* For handling composition sequence. */
3035 #define COMPOSING_NO 0
3036 #define COMPOSING_CHAR 1
3037 #define COMPOSING_RULE 2
3038 #define COMPOSING_COMPONENT_CHAR 3
3039 #define COMPOSING_COMPONENT_RULE 4
3041 int composition_state = COMPOSING_NO;
3042 enum composition_method method;
3043 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
3044 int component_idx;
3045 int component_len;
3046 Lisp_Object attrs, charset_list;
3047 int char_offset = coding->produced_char;
3048 int last_offset = char_offset;
3049 int last_id = charset_ascii;
3050 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3051 int byte_after_cr = -1;
3053 CODING_GET_INFO (coding, attrs, charset_list);
3054 setup_iso_safe_charsets (attrs);
3055 /* Charset list may have been changed. */
3056 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3057 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3059 while (1)
3061 int c1, c2;
3063 src_base = src;
3064 consumed_chars_base = consumed_chars;
3066 if (charbuf >= charbuf_end)
3067 break;
3069 if (byte_after_cr >= 0)
3070 c1 = byte_after_cr, byte_after_cr = -1;
3071 else
3072 ONE_MORE_BYTE (c1);
3073 if (c1 < 0)
3074 goto invalid_code;
3076 /* We produce at most one character. */
3077 switch (iso_code_class [c1])
3079 case ISO_0x20_or_0x7F:
3080 if (composition_state != COMPOSING_NO)
3082 if (composition_state == COMPOSING_RULE
3083 || composition_state == COMPOSING_COMPONENT_RULE)
3085 DECODE_COMPOSITION_RULE (c1);
3086 components[component_idx++] = c1;
3087 composition_state--;
3088 continue;
3091 if (charset_id_0 < 0
3092 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3093 /* This is SPACE or DEL. */
3094 charset = CHARSET_FROM_ID (charset_ascii);
3095 else
3096 charset = CHARSET_FROM_ID (charset_id_0);
3097 break;
3099 case ISO_graphic_plane_0:
3100 if (composition_state != COMPOSING_NO)
3102 if (composition_state == COMPOSING_RULE
3103 || composition_state == COMPOSING_COMPONENT_RULE)
3105 DECODE_COMPOSITION_RULE (c1);
3106 components[component_idx++] = c1;
3107 composition_state--;
3108 continue;
3111 if (charset_id_0 < 0)
3112 charset = CHARSET_FROM_ID (charset_ascii);
3113 else
3114 charset = CHARSET_FROM_ID (charset_id_0);
3115 break;
3117 case ISO_0xA0_or_0xFF:
3118 if (charset_id_1 < 0
3119 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3120 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3121 goto invalid_code;
3122 /* This is a graphic character, we fall down ... */
3124 case ISO_graphic_plane_1:
3125 if (charset_id_1 < 0)
3126 goto invalid_code;
3127 charset = CHARSET_FROM_ID (charset_id_1);
3128 break;
3130 case ISO_control_0:
3131 if (eol_crlf && c1 == '\r')
3132 ONE_MORE_BYTE (byte_after_cr);
3133 MAYBE_FINISH_COMPOSITION ();
3134 charset = CHARSET_FROM_ID (charset_ascii);
3135 break;
3137 case ISO_control_1:
3138 MAYBE_FINISH_COMPOSITION ();
3139 goto invalid_code;
3141 case ISO_shift_out:
3142 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3143 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3144 goto invalid_code;
3145 CODING_ISO_INVOCATION (coding, 0) = 1;
3146 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3147 continue;
3149 case ISO_shift_in:
3150 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3151 goto invalid_code;
3152 CODING_ISO_INVOCATION (coding, 0) = 0;
3153 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3154 continue;
3156 case ISO_single_shift_2_7:
3157 case ISO_single_shift_2:
3158 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3159 goto invalid_code;
3160 /* SS2 is handled as an escape sequence of ESC 'N' */
3161 c1 = 'N';
3162 goto label_escape_sequence;
3164 case ISO_single_shift_3:
3165 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3166 goto invalid_code;
3167 /* SS2 is handled as an escape sequence of ESC 'O' */
3168 c1 = 'O';
3169 goto label_escape_sequence;
3171 case ISO_control_sequence_introducer:
3172 /* CSI is handled as an escape sequence of ESC '[' ... */
3173 c1 = '[';
3174 goto label_escape_sequence;
3176 case ISO_escape:
3177 ONE_MORE_BYTE (c1);
3178 label_escape_sequence:
3179 /* Escape sequences handled here are invocation,
3180 designation, direction specification, and character
3181 composition specification. */
3182 switch (c1)
3184 case '&': /* revision of following character set */
3185 ONE_MORE_BYTE (c1);
3186 if (!(c1 >= '@' && c1 <= '~'))
3187 goto invalid_code;
3188 ONE_MORE_BYTE (c1);
3189 if (c1 != ISO_CODE_ESC)
3190 goto invalid_code;
3191 ONE_MORE_BYTE (c1);
3192 goto label_escape_sequence;
3194 case '$': /* designation of 2-byte character set */
3195 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3196 goto invalid_code;
3198 int reg, chars96;
3200 ONE_MORE_BYTE (c1);
3201 if (c1 >= '@' && c1 <= 'B')
3202 { /* designation of JISX0208.1978, GB2312.1980,
3203 or JISX0208.1980 */
3204 reg = 0, chars96 = 0;
3206 else if (c1 >= 0x28 && c1 <= 0x2B)
3207 { /* designation of DIMENSION2_CHARS94 character set */
3208 reg = c1 - 0x28, chars96 = 0;
3209 ONE_MORE_BYTE (c1);
3211 else if (c1 >= 0x2C && c1 <= 0x2F)
3212 { /* designation of DIMENSION2_CHARS96 character set */
3213 reg = c1 - 0x2C, chars96 = 1;
3214 ONE_MORE_BYTE (c1);
3216 else
3217 goto invalid_code;
3218 DECODE_DESIGNATION (reg, 2, chars96, c1);
3219 /* We must update these variables now. */
3220 if (reg == 0)
3221 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3222 else if (reg == 1)
3223 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3224 if (chars96 < 0)
3225 goto invalid_code;
3227 continue;
3229 case 'n': /* invocation of locking-shift-2 */
3230 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3231 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3232 goto invalid_code;
3233 CODING_ISO_INVOCATION (coding, 0) = 2;
3234 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3235 continue;
3237 case 'o': /* invocation of locking-shift-3 */
3238 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3239 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3240 goto invalid_code;
3241 CODING_ISO_INVOCATION (coding, 0) = 3;
3242 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3243 continue;
3245 case 'N': /* invocation of single-shift-2 */
3246 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3247 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3248 goto invalid_code;
3249 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3250 if (charset_id_2 < 0)
3251 charset = CHARSET_FROM_ID (charset_ascii);
3252 else
3253 charset = CHARSET_FROM_ID (charset_id_2);
3254 ONE_MORE_BYTE (c1);
3255 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3256 goto invalid_code;
3257 break;
3259 case 'O': /* invocation of single-shift-3 */
3260 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3261 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3262 goto invalid_code;
3263 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3264 if (charset_id_3 < 0)
3265 charset = CHARSET_FROM_ID (charset_ascii);
3266 else
3267 charset = CHARSET_FROM_ID (charset_id_3);
3268 ONE_MORE_BYTE (c1);
3269 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3270 goto invalid_code;
3271 break;
3273 case '0': case '2': case '3': case '4': /* start composition */
3274 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3275 goto invalid_code;
3276 DECODE_COMPOSITION_START (c1);
3277 continue;
3279 case '1': /* end composition */
3280 if (composition_state == COMPOSING_NO)
3281 goto invalid_code;
3282 DECODE_COMPOSITION_END ();
3283 continue;
3285 case '[': /* specification of direction */
3286 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3287 goto invalid_code;
3288 /* For the moment, nested direction is not supported.
3289 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3290 left-to-right, and nozero means right-to-left. */
3291 ONE_MORE_BYTE (c1);
3292 switch (c1)
3294 case ']': /* end of the current direction */
3295 coding->mode &= ~CODING_MODE_DIRECTION;
3297 case '0': /* end of the current direction */
3298 case '1': /* start of left-to-right direction */
3299 ONE_MORE_BYTE (c1);
3300 if (c1 == ']')
3301 coding->mode &= ~CODING_MODE_DIRECTION;
3302 else
3303 goto invalid_code;
3304 break;
3306 case '2': /* start of right-to-left direction */
3307 ONE_MORE_BYTE (c1);
3308 if (c1 == ']')
3309 coding->mode |= CODING_MODE_DIRECTION;
3310 else
3311 goto invalid_code;
3312 break;
3314 default:
3315 goto invalid_code;
3317 continue;
3319 case '%':
3320 ONE_MORE_BYTE (c1);
3321 if (c1 == '/')
3323 /* CTEXT extended segment:
3324 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3325 We keep these bytes as is for the moment.
3326 They may be decoded by post-read-conversion. */
3327 int dim, M, L;
3328 int size;
3330 ONE_MORE_BYTE (dim);
3331 ONE_MORE_BYTE (M);
3332 ONE_MORE_BYTE (L);
3333 size = ((M - 128) * 128) + (L - 128);
3334 if (charbuf + 8 + size > charbuf_end)
3335 goto break_loop;
3336 *charbuf++ = ISO_CODE_ESC;
3337 *charbuf++ = '%';
3338 *charbuf++ = '/';
3339 *charbuf++ = dim;
3340 *charbuf++ = BYTE8_TO_CHAR (M);
3341 *charbuf++ = BYTE8_TO_CHAR (L);
3342 while (size-- > 0)
3344 ONE_MORE_BYTE (c1);
3345 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3348 else if (c1 == 'G')
3350 /* XFree86 extension for embedding UTF-8 in CTEXT:
3351 ESC % G --UTF-8-BYTES-- ESC % @
3352 We keep these bytes as is for the moment.
3353 They may be decoded by post-read-conversion. */
3354 int *p = charbuf;
3356 if (p + 6 > charbuf_end)
3357 goto break_loop;
3358 *p++ = ISO_CODE_ESC;
3359 *p++ = '%';
3360 *p++ = 'G';
3361 while (p < charbuf_end)
3363 ONE_MORE_BYTE (c1);
3364 if (c1 == ISO_CODE_ESC
3365 && src + 1 < src_end
3366 && src[0] == '%'
3367 && src[1] == '@')
3369 src += 2;
3370 break;
3372 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3374 if (p + 3 > charbuf_end)
3375 goto break_loop;
3376 *p++ = ISO_CODE_ESC;
3377 *p++ = '%';
3378 *p++ = '@';
3379 charbuf = p;
3381 else
3382 goto invalid_code;
3383 continue;
3384 break;
3386 default:
3387 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3388 goto invalid_code;
3390 int reg, chars96;
3392 if (c1 >= 0x28 && c1 <= 0x2B)
3393 { /* designation of DIMENSION1_CHARS94 character set */
3394 reg = c1 - 0x28, chars96 = 0;
3395 ONE_MORE_BYTE (c1);
3397 else if (c1 >= 0x2C && c1 <= 0x2F)
3398 { /* designation of DIMENSION1_CHARS96 character set */
3399 reg = c1 - 0x2C, chars96 = 1;
3400 ONE_MORE_BYTE (c1);
3402 else
3403 goto invalid_code;
3404 DECODE_DESIGNATION (reg, 1, chars96, c1);
3405 /* We must update these variables now. */
3406 if (reg == 0)
3407 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3408 else if (reg == 1)
3409 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3410 if (chars96 < 0)
3411 goto invalid_code;
3413 continue;
3417 if (charset->id != charset_ascii
3418 && last_id != charset->id)
3420 if (last_id != charset_ascii)
3421 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3422 last_id = charset->id;
3423 last_offset = char_offset;
3426 /* Now we know CHARSET and 1st position code C1 of a character.
3427 Produce a decoded character while getting 2nd position code
3428 C2 if necessary. */
3429 c1 &= 0x7F;
3430 if (CHARSET_DIMENSION (charset) > 1)
3432 ONE_MORE_BYTE (c2);
3433 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3434 /* C2 is not in a valid range. */
3435 goto invalid_code;
3436 c1 = (c1 << 8) | (c2 & 0x7F);
3437 if (CHARSET_DIMENSION (charset) > 2)
3439 ONE_MORE_BYTE (c2);
3440 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3441 /* C2 is not in a valid range. */
3442 goto invalid_code;
3443 c1 = (c1 << 8) | (c2 & 0x7F);
3447 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3448 if (c < 0)
3450 MAYBE_FINISH_COMPOSITION ();
3451 for (; src_base < src; src_base++, char_offset++)
3453 if (ASCII_BYTE_P (*src_base))
3454 *charbuf++ = *src_base;
3455 else
3456 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3459 else if (composition_state == COMPOSING_NO)
3461 *charbuf++ = c;
3462 char_offset++;
3464 else
3466 components[component_idx++] = c;
3467 if (method == COMPOSITION_WITH_RULE
3468 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3469 && composition_state == COMPOSING_COMPONENT_CHAR))
3470 composition_state++;
3472 continue;
3474 invalid_code:
3475 MAYBE_FINISH_COMPOSITION ();
3476 src = src_base;
3477 consumed_chars = consumed_chars_base;
3478 ONE_MORE_BYTE (c);
3479 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3480 char_offset++;
3481 coding->errors++;
3482 continue;
3484 break_loop:
3485 break;
3488 no_more_source:
3489 if (last_id != charset_ascii)
3490 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3491 coding->consumed_char += consumed_chars_base;
3492 coding->consumed = src_base - coding->source;
3493 coding->charbuf_used = charbuf - coding->charbuf;
3497 /* ISO2022 encoding stuff. */
3500 It is not enough to say just "ISO2022" on encoding, we have to
3501 specify more details. In Emacs, each coding system of ISO2022
3502 variant has the following specifications:
3503 1. Initial designation to G0 thru G3.
3504 2. Allows short-form designation?
3505 3. ASCII should be designated to G0 before control characters?
3506 4. ASCII should be designated to G0 at end of line?
3507 5. 7-bit environment or 8-bit environment?
3508 6. Use locking-shift?
3509 7. Use Single-shift?
3510 And the following two are only for Japanese:
3511 8. Use ASCII in place of JIS0201-1976-Roman?
3512 9. Use JISX0208-1983 in place of JISX0208-1978?
3513 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3514 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3515 details.
3518 /* Produce codes (escape sequence) for designating CHARSET to graphic
3519 register REG at DST, and increment DST. If <final-char> of CHARSET is
3520 '@', 'A', or 'B' and the coding system CODING allows, produce
3521 designation sequence of short-form. */
3523 #define ENCODE_DESIGNATION(charset, reg, coding) \
3524 do { \
3525 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3526 char *intermediate_char_94 = "()*+"; \
3527 char *intermediate_char_96 = ",-./"; \
3528 int revision = -1; \
3529 int c; \
3531 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3532 revision = CHARSET_ISO_REVISION (charset); \
3534 if (revision >= 0) \
3536 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3537 EMIT_ONE_BYTE ('@' + revision); \
3539 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3540 if (CHARSET_DIMENSION (charset) == 1) \
3542 if (! CHARSET_ISO_CHARS_96 (charset)) \
3543 c = intermediate_char_94[reg]; \
3544 else \
3545 c = intermediate_char_96[reg]; \
3546 EMIT_ONE_ASCII_BYTE (c); \
3548 else \
3550 EMIT_ONE_ASCII_BYTE ('$'); \
3551 if (! CHARSET_ISO_CHARS_96 (charset)) \
3553 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3554 || reg != 0 \
3555 || final_char < '@' || final_char > 'B') \
3556 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3558 else \
3559 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3561 EMIT_ONE_ASCII_BYTE (final_char); \
3563 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3564 } while (0)
3567 /* The following two macros produce codes (control character or escape
3568 sequence) for ISO2022 single-shift functions (single-shift-2 and
3569 single-shift-3). */
3571 #define ENCODE_SINGLE_SHIFT_2 \
3572 do { \
3573 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3574 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3575 else \
3576 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3577 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3578 } while (0)
3581 #define ENCODE_SINGLE_SHIFT_3 \
3582 do { \
3583 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3584 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3585 else \
3586 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3587 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3588 } while (0)
3591 /* The following four macros produce codes (control character or
3592 escape sequence) for ISO2022 locking-shift functions (shift-in,
3593 shift-out, locking-shift-2, and locking-shift-3). */
3595 #define ENCODE_SHIFT_IN \
3596 do { \
3597 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3598 CODING_ISO_INVOCATION (coding, 0) = 0; \
3599 } while (0)
3602 #define ENCODE_SHIFT_OUT \
3603 do { \
3604 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3605 CODING_ISO_INVOCATION (coding, 0) = 1; \
3606 } while (0)
3609 #define ENCODE_LOCKING_SHIFT_2 \
3610 do { \
3611 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3612 CODING_ISO_INVOCATION (coding, 0) = 2; \
3613 } while (0)
3616 #define ENCODE_LOCKING_SHIFT_3 \
3617 do { \
3618 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3619 CODING_ISO_INVOCATION (coding, 0) = 3; \
3620 } while (0)
3623 /* Produce codes for a DIMENSION1 character whose character set is
3624 CHARSET and whose position-code is C1. Designation and invocation
3625 sequences are also produced in advance if necessary. */
3627 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3628 do { \
3629 int id = CHARSET_ID (charset); \
3631 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3632 && id == charset_ascii) \
3634 id = charset_jisx0201_roman; \
3635 charset = CHARSET_FROM_ID (id); \
3638 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3640 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3641 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3642 else \
3643 EMIT_ONE_BYTE (c1 | 0x80); \
3644 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3645 break; \
3647 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3649 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3650 break; \
3652 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3654 EMIT_ONE_BYTE (c1 | 0x80); \
3655 break; \
3657 else \
3658 /* Since CHARSET is not yet invoked to any graphic planes, we \
3659 must invoke it, or, at first, designate it to some graphic \
3660 register. Then repeat the loop to actually produce the \
3661 character. */ \
3662 dst = encode_invocation_designation (charset, coding, dst, \
3663 &produced_chars); \
3664 } while (1)
3667 /* Produce codes for a DIMENSION2 character whose character set is
3668 CHARSET and whose position-codes are C1 and C2. Designation and
3669 invocation codes are also produced in advance if necessary. */
3671 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3672 do { \
3673 int id = CHARSET_ID (charset); \
3675 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3676 && id == charset_jisx0208) \
3678 id = charset_jisx0208_1978; \
3679 charset = CHARSET_FROM_ID (id); \
3682 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3684 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3685 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3686 else \
3687 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3688 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3689 break; \
3691 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3693 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3694 break; \
3696 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3698 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3699 break; \
3701 else \
3702 /* Since CHARSET is not yet invoked to any graphic planes, we \
3703 must invoke it, or, at first, designate it to some graphic \
3704 register. Then repeat the loop to actually produce the \
3705 character. */ \
3706 dst = encode_invocation_designation (charset, coding, dst, \
3707 &produced_chars); \
3708 } while (1)
3711 #define ENCODE_ISO_CHARACTER(charset, c) \
3712 do { \
3713 int code = ENCODE_CHAR ((charset),(c)); \
3715 if (CHARSET_DIMENSION (charset) == 1) \
3716 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3717 else \
3718 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3719 } while (0)
3722 /* Produce designation and invocation codes at a place pointed by DST
3723 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3724 Return new DST. */
3726 unsigned char *
3727 encode_invocation_designation (charset, coding, dst, p_nchars)
3728 struct charset *charset;
3729 struct coding_system *coding;
3730 unsigned char *dst;
3731 int *p_nchars;
3733 int multibytep = coding->dst_multibyte;
3734 int produced_chars = *p_nchars;
3735 int reg; /* graphic register number */
3736 int id = CHARSET_ID (charset);
3738 /* At first, check designations. */
3739 for (reg = 0; reg < 4; reg++)
3740 if (id == CODING_ISO_DESIGNATION (coding, reg))
3741 break;
3743 if (reg >= 4)
3745 /* CHARSET is not yet designated to any graphic registers. */
3746 /* At first check the requested designation. */
3747 reg = CODING_ISO_REQUEST (coding, id);
3748 if (reg < 0)
3749 /* Since CHARSET requests no special designation, designate it
3750 to graphic register 0. */
3751 reg = 0;
3753 ENCODE_DESIGNATION (charset, reg, coding);
3756 if (CODING_ISO_INVOCATION (coding, 0) != reg
3757 && CODING_ISO_INVOCATION (coding, 1) != reg)
3759 /* Since the graphic register REG is not invoked to any graphic
3760 planes, invoke it to graphic plane 0. */
3761 switch (reg)
3763 case 0: /* graphic register 0 */
3764 ENCODE_SHIFT_IN;
3765 break;
3767 case 1: /* graphic register 1 */
3768 ENCODE_SHIFT_OUT;
3769 break;
3771 case 2: /* graphic register 2 */
3772 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3773 ENCODE_SINGLE_SHIFT_2;
3774 else
3775 ENCODE_LOCKING_SHIFT_2;
3776 break;
3778 case 3: /* graphic register 3 */
3779 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3780 ENCODE_SINGLE_SHIFT_3;
3781 else
3782 ENCODE_LOCKING_SHIFT_3;
3783 break;
3787 *p_nchars = produced_chars;
3788 return dst;
3791 /* The following three macros produce codes for indicating direction
3792 of text. */
3793 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3794 do { \
3795 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3796 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3797 else \
3798 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3799 } while (0)
3802 #define ENCODE_DIRECTION_R2L() \
3803 do { \
3804 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3805 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3806 } while (0)
3809 #define ENCODE_DIRECTION_L2R() \
3810 do { \
3811 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3812 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3813 } while (0)
3816 /* Produce codes for designation and invocation to reset the graphic
3817 planes and registers to initial state. */
3818 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3819 do { \
3820 int reg; \
3821 struct charset *charset; \
3823 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3824 ENCODE_SHIFT_IN; \
3825 for (reg = 0; reg < 4; reg++) \
3826 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3827 && (CODING_ISO_DESIGNATION (coding, reg) \
3828 != CODING_ISO_INITIAL (coding, reg))) \
3830 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3831 ENCODE_DESIGNATION (charset, reg, coding); \
3833 } while (0)
3836 /* Produce designation sequences of charsets in the line started from
3837 SRC to a place pointed by DST, and return updated DST.
3839 If the current block ends before any end-of-line, we may fail to
3840 find all the necessary designations. */
3842 static unsigned char *
3843 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3844 struct coding_system *coding;
3845 int *charbuf, *charbuf_end;
3846 unsigned char *dst;
3848 struct charset *charset;
3849 /* Table of charsets to be designated to each graphic register. */
3850 int r[4];
3851 int c, found = 0, reg;
3852 int produced_chars = 0;
3853 int multibytep = coding->dst_multibyte;
3854 Lisp_Object attrs;
3855 Lisp_Object charset_list;
3857 attrs = CODING_ID_ATTRS (coding->id);
3858 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3859 if (EQ (charset_list, Qiso_2022))
3860 charset_list = Viso_2022_charset_list;
3862 for (reg = 0; reg < 4; reg++)
3863 r[reg] = -1;
3865 while (found < 4)
3867 int id;
3869 c = *charbuf++;
3870 if (c == '\n')
3871 break;
3872 charset = char_charset (c, charset_list, NULL);
3873 id = CHARSET_ID (charset);
3874 reg = CODING_ISO_REQUEST (coding, id);
3875 if (reg >= 0 && r[reg] < 0)
3877 found++;
3878 r[reg] = id;
3882 if (found)
3884 for (reg = 0; reg < 4; reg++)
3885 if (r[reg] >= 0
3886 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3887 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3890 return dst;
3893 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3895 static int
3896 encode_coding_iso_2022 (coding)
3897 struct coding_system *coding;
3899 int multibytep = coding->dst_multibyte;
3900 int *charbuf = coding->charbuf;
3901 int *charbuf_end = charbuf + coding->charbuf_used;
3902 unsigned char *dst = coding->destination + coding->produced;
3903 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3904 int safe_room = 16;
3905 int bol_designation
3906 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3907 && CODING_ISO_BOL (coding));
3908 int produced_chars = 0;
3909 Lisp_Object attrs, eol_type, charset_list;
3910 int ascii_compatible;
3911 int c;
3912 int preferred_charset_id = -1;
3914 CODING_GET_INFO (coding, attrs, charset_list);
3915 eol_type = CODING_ID_EOL_TYPE (coding->id);
3916 if (VECTORP (eol_type))
3917 eol_type = Qunix;
3919 setup_iso_safe_charsets (attrs);
3920 /* Charset list may have been changed. */
3921 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3922 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3924 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3926 while (charbuf < charbuf_end)
3928 ASSURE_DESTINATION (safe_room);
3930 if (bol_designation)
3932 unsigned char *dst_prev = dst;
3934 /* We have to produce designation sequences if any now. */
3935 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3936 bol_designation = 0;
3937 /* We are sure that designation sequences are all ASCII bytes. */
3938 produced_chars += dst - dst_prev;
3941 c = *charbuf++;
3943 if (c < 0)
3945 /* Handle an annotation. */
3946 switch (*charbuf)
3948 case CODING_ANNOTATE_COMPOSITION_MASK:
3949 /* Not yet implemented. */
3950 break;
3951 case CODING_ANNOTATE_CHARSET_MASK:
3952 preferred_charset_id = charbuf[2];
3953 if (preferred_charset_id >= 0
3954 && NILP (Fmemq (make_number (preferred_charset_id),
3955 charset_list)))
3956 preferred_charset_id = -1;
3957 break;
3958 default:
3959 abort ();
3961 charbuf += -c - 1;
3962 continue;
3965 /* Now encode the character C. */
3966 if (c < 0x20 || c == 0x7F)
3968 if (c == '\n'
3969 || (c == '\r' && EQ (eol_type, Qmac)))
3971 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3972 ENCODE_RESET_PLANE_AND_REGISTER ();
3973 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3975 int i;
3977 for (i = 0; i < 4; i++)
3978 CODING_ISO_DESIGNATION (coding, i)
3979 = CODING_ISO_INITIAL (coding, i);
3981 bol_designation
3982 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3984 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3985 ENCODE_RESET_PLANE_AND_REGISTER ();
3986 EMIT_ONE_ASCII_BYTE (c);
3988 else if (ASCII_CHAR_P (c))
3990 if (ascii_compatible)
3991 EMIT_ONE_ASCII_BYTE (c);
3992 else
3994 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3995 ENCODE_ISO_CHARACTER (charset, c);
3998 else if (CHAR_BYTE8_P (c))
4000 c = CHAR_TO_BYTE8 (c);
4001 EMIT_ONE_BYTE (c);
4003 else
4005 struct charset *charset;
4007 if (preferred_charset_id >= 0)
4009 charset = CHARSET_FROM_ID (preferred_charset_id);
4010 if (! CHAR_CHARSET_P (c, charset))
4011 charset = char_charset (c, charset_list, NULL);
4013 else
4014 charset = char_charset (c, charset_list, NULL);
4015 if (!charset)
4017 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4019 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4020 charset = CHARSET_FROM_ID (charset_ascii);
4022 else
4024 c = coding->default_char;
4025 charset = char_charset (c, charset_list, NULL);
4028 ENCODE_ISO_CHARACTER (charset, c);
4032 if (coding->mode & CODING_MODE_LAST_BLOCK
4033 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4035 ASSURE_DESTINATION (safe_room);
4036 ENCODE_RESET_PLANE_AND_REGISTER ();
4038 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4039 CODING_ISO_BOL (coding) = bol_designation;
4040 coding->produced_char += produced_chars;
4041 coding->produced = dst - coding->destination;
4042 return 0;
4046 /*** 8,9. SJIS and BIG5 handlers ***/
4048 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4049 quite widely. So, for the moment, Emacs supports them in the bare
4050 C code. But, in the future, they may be supported only by CCL. */
4052 /* SJIS is a coding system encoding three character sets: ASCII, right
4053 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4054 as is. A character of charset katakana-jisx0201 is encoded by
4055 "position-code + 0x80". A character of charset japanese-jisx0208
4056 is encoded in 2-byte but two position-codes are divided and shifted
4057 so that it fit in the range below.
4059 --- CODE RANGE of SJIS ---
4060 (character set) (range)
4061 ASCII 0x00 .. 0x7F
4062 KATAKANA-JISX0201 0xA0 .. 0xDF
4063 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4064 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4065 -------------------------------
4069 /* BIG5 is a coding system encoding two character sets: ASCII and
4070 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4071 character set and is encoded in two-byte.
4073 --- CODE RANGE of BIG5 ---
4074 (character set) (range)
4075 ASCII 0x00 .. 0x7F
4076 Big5 (1st byte) 0xA1 .. 0xFE
4077 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4078 --------------------------
4082 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4083 Check if a text is encoded in SJIS. If it is, return
4084 CATEGORY_MASK_SJIS, else return 0. */
4086 static int
4087 detect_coding_sjis (coding, detect_info)
4088 struct coding_system *coding;
4089 struct coding_detection_info *detect_info;
4091 const unsigned char *src = coding->source, *src_base;
4092 const unsigned char *src_end = coding->source + coding->src_bytes;
4093 int multibytep = coding->src_multibyte;
4094 int consumed_chars = 0;
4095 int found = 0;
4096 int c;
4098 detect_info->checked |= CATEGORY_MASK_SJIS;
4099 /* A coding system of this category is always ASCII compatible. */
4100 src += coding->head_ascii;
4102 while (1)
4104 src_base = src;
4105 ONE_MORE_BYTE (c);
4106 if (c < 0x80)
4107 continue;
4108 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4110 ONE_MORE_BYTE (c);
4111 if (c < 0x40 || c == 0x7F || c > 0xFC)
4112 break;
4113 found = CATEGORY_MASK_SJIS;
4115 else if (c >= 0xA0 && c < 0xE0)
4116 found = CATEGORY_MASK_SJIS;
4117 else
4118 break;
4120 detect_info->rejected |= CATEGORY_MASK_SJIS;
4121 return 0;
4123 no_more_source:
4124 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4126 detect_info->rejected |= CATEGORY_MASK_SJIS;
4127 return 0;
4129 detect_info->found |= found;
4130 return 1;
4133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4134 Check if a text is encoded in BIG5. If it is, return
4135 CATEGORY_MASK_BIG5, else return 0. */
4137 static int
4138 detect_coding_big5 (coding, detect_info)
4139 struct coding_system *coding;
4140 struct coding_detection_info *detect_info;
4142 const unsigned char *src = coding->source, *src_base;
4143 const unsigned char *src_end = coding->source + coding->src_bytes;
4144 int multibytep = coding->src_multibyte;
4145 int consumed_chars = 0;
4146 int found = 0;
4147 int c;
4149 detect_info->checked |= CATEGORY_MASK_BIG5;
4150 /* A coding system of this category is always ASCII compatible. */
4151 src += coding->head_ascii;
4153 while (1)
4155 src_base = src;
4156 ONE_MORE_BYTE (c);
4157 if (c < 0x80)
4158 continue;
4159 if (c >= 0xA1)
4161 ONE_MORE_BYTE (c);
4162 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4163 return 0;
4164 found = CATEGORY_MASK_BIG5;
4166 else
4167 break;
4169 detect_info->rejected |= CATEGORY_MASK_BIG5;
4170 return 0;
4172 no_more_source:
4173 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4175 detect_info->rejected |= CATEGORY_MASK_BIG5;
4176 return 0;
4178 detect_info->found |= found;
4179 return 1;
4182 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4183 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
4185 static void
4186 decode_coding_sjis (coding)
4187 struct coding_system *coding;
4189 const unsigned char *src = coding->source + coding->consumed;
4190 const unsigned char *src_end = coding->source + coding->src_bytes;
4191 const unsigned char *src_base;
4192 int *charbuf = coding->charbuf + coding->charbuf_used;
4193 int *charbuf_end
4194 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4195 int consumed_chars = 0, consumed_chars_base;
4196 int multibytep = coding->src_multibyte;
4197 struct charset *charset_roman, *charset_kanji, *charset_kana;
4198 struct charset *charset_kanji2;
4199 Lisp_Object attrs, charset_list, val;
4200 int char_offset = coding->produced_char;
4201 int last_offset = char_offset;
4202 int last_id = charset_ascii;
4203 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4204 int byte_after_cr = -1;
4206 CODING_GET_INFO (coding, attrs, charset_list);
4208 val = charset_list;
4209 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4210 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4211 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4212 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4214 while (1)
4216 int c, c1;
4217 struct charset *charset;
4219 src_base = src;
4220 consumed_chars_base = consumed_chars;
4222 if (charbuf >= charbuf_end)
4223 break;
4225 if (byte_after_cr >= 0)
4226 c = byte_after_cr, byte_after_cr = -1;
4227 else
4228 ONE_MORE_BYTE (c);
4229 if (c < 0)
4230 goto invalid_code;
4231 if (c < 0x80)
4233 if (eol_crlf && c == '\r')
4234 ONE_MORE_BYTE (byte_after_cr);
4235 charset = charset_roman;
4237 else if (c == 0x80 || c == 0xA0)
4238 goto invalid_code;
4239 else if (c >= 0xA1 && c <= 0xDF)
4241 /* SJIS -> JISX0201-Kana */
4242 c &= 0x7F;
4243 charset = charset_kana;
4245 else if (c <= 0xEF)
4247 /* SJIS -> JISX0208 */
4248 ONE_MORE_BYTE (c1);
4249 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4250 goto invalid_code;
4251 c = (c << 8) | c1;
4252 SJIS_TO_JIS (c);
4253 charset = charset_kanji;
4255 else if (c <= 0xFC && charset_kanji2)
4257 /* SJIS -> JISX0213-2 */
4258 ONE_MORE_BYTE (c1);
4259 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4260 goto invalid_code;
4261 c = (c << 8) | c1;
4262 SJIS_TO_JIS2 (c);
4263 charset = charset_kanji2;
4265 else
4266 goto invalid_code;
4267 if (charset->id != charset_ascii
4268 && last_id != charset->id)
4270 if (last_id != charset_ascii)
4271 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4272 last_id = charset->id;
4273 last_offset = char_offset;
4275 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4276 *charbuf++ = c;
4277 char_offset++;
4278 continue;
4280 invalid_code:
4281 src = src_base;
4282 consumed_chars = consumed_chars_base;
4283 ONE_MORE_BYTE (c);
4284 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4285 char_offset++;
4286 coding->errors++;
4289 no_more_source:
4290 if (last_id != charset_ascii)
4291 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4292 coding->consumed_char += consumed_chars_base;
4293 coding->consumed = src_base - coding->source;
4294 coding->charbuf_used = charbuf - coding->charbuf;
4297 static void
4298 decode_coding_big5 (coding)
4299 struct coding_system *coding;
4301 const unsigned char *src = coding->source + coding->consumed;
4302 const unsigned char *src_end = coding->source + coding->src_bytes;
4303 const unsigned char *src_base;
4304 int *charbuf = coding->charbuf + coding->charbuf_used;
4305 int *charbuf_end
4306 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4307 int consumed_chars = 0, consumed_chars_base;
4308 int multibytep = coding->src_multibyte;
4309 struct charset *charset_roman, *charset_big5;
4310 Lisp_Object attrs, charset_list, val;
4311 int char_offset = coding->produced_char;
4312 int last_offset = char_offset;
4313 int last_id = charset_ascii;
4314 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4315 int byte_after_cr = -1;
4317 CODING_GET_INFO (coding, attrs, charset_list);
4318 val = charset_list;
4319 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4320 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4322 while (1)
4324 int c, c1;
4325 struct charset *charset;
4327 src_base = src;
4328 consumed_chars_base = consumed_chars;
4330 if (charbuf >= charbuf_end)
4331 break;
4333 if (byte_after_cr >= 0)
4334 c = byte_after_cr, byte_after_cr = -1;
4335 else
4336 ONE_MORE_BYTE (c);
4338 if (c < 0)
4339 goto invalid_code;
4340 if (c < 0x80)
4342 if (eol_crlf && c == '\r')
4343 ONE_MORE_BYTE (byte_after_cr);
4344 charset = charset_roman;
4346 else
4348 /* BIG5 -> Big5 */
4349 if (c < 0xA1 || c > 0xFE)
4350 goto invalid_code;
4351 ONE_MORE_BYTE (c1);
4352 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4353 goto invalid_code;
4354 c = c << 8 | c1;
4355 charset = charset_big5;
4357 if (charset->id != charset_ascii
4358 && last_id != charset->id)
4360 if (last_id != charset_ascii)
4361 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4362 last_id = charset->id;
4363 last_offset = char_offset;
4365 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4366 *charbuf++ = c;
4367 char_offset++;
4368 continue;
4370 invalid_code:
4371 src = src_base;
4372 consumed_chars = consumed_chars_base;
4373 ONE_MORE_BYTE (c);
4374 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4375 char_offset++;
4376 coding->errors++;
4379 no_more_source:
4380 if (last_id != charset_ascii)
4381 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4382 coding->consumed_char += consumed_chars_base;
4383 coding->consumed = src_base - coding->source;
4384 coding->charbuf_used = charbuf - coding->charbuf;
4387 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4388 This function can encode charsets `ascii', `katakana-jisx0201',
4389 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4390 are sure that all these charsets are registered as official charset
4391 (i.e. do not have extended leading-codes). Characters of other
4392 charsets are produced without any encoding. If SJIS_P is 1, encode
4393 SJIS text, else encode BIG5 text. */
4395 static int
4396 encode_coding_sjis (coding)
4397 struct coding_system *coding;
4399 int multibytep = coding->dst_multibyte;
4400 int *charbuf = coding->charbuf;
4401 int *charbuf_end = charbuf + coding->charbuf_used;
4402 unsigned char *dst = coding->destination + coding->produced;
4403 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4404 int safe_room = 4;
4405 int produced_chars = 0;
4406 Lisp_Object attrs, charset_list, val;
4407 int ascii_compatible;
4408 struct charset *charset_roman, *charset_kanji, *charset_kana;
4409 struct charset *charset_kanji2;
4410 int c;
4412 CODING_GET_INFO (coding, attrs, charset_list);
4413 val = charset_list;
4414 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4415 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4416 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4417 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4419 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4421 while (charbuf < charbuf_end)
4423 ASSURE_DESTINATION (safe_room);
4424 c = *charbuf++;
4425 /* Now encode the character C. */
4426 if (ASCII_CHAR_P (c) && ascii_compatible)
4427 EMIT_ONE_ASCII_BYTE (c);
4428 else if (CHAR_BYTE8_P (c))
4430 c = CHAR_TO_BYTE8 (c);
4431 EMIT_ONE_BYTE (c);
4433 else
4435 unsigned code;
4436 struct charset *charset = char_charset (c, charset_list, &code);
4438 if (!charset)
4440 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4442 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4443 charset = CHARSET_FROM_ID (charset_ascii);
4445 else
4447 c = coding->default_char;
4448 charset = char_charset (c, charset_list, &code);
4451 if (code == CHARSET_INVALID_CODE (charset))
4452 abort ();
4453 if (charset == charset_kanji)
4455 int c1, c2;
4456 JIS_TO_SJIS (code);
4457 c1 = code >> 8, c2 = code & 0xFF;
4458 EMIT_TWO_BYTES (c1, c2);
4460 else if (charset == charset_kana)
4461 EMIT_ONE_BYTE (code | 0x80);
4462 else if (charset_kanji2 && charset == charset_kanji2)
4464 int c1, c2;
4466 c1 = code >> 8;
4467 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4468 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4470 JIS_TO_SJIS2 (code);
4471 c1 = code >> 8, c2 = code & 0xFF;
4472 EMIT_TWO_BYTES (c1, c2);
4474 else
4475 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4477 else
4478 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4481 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4482 coding->produced_char += produced_chars;
4483 coding->produced = dst - coding->destination;
4484 return 0;
4487 static int
4488 encode_coding_big5 (coding)
4489 struct coding_system *coding;
4491 int multibytep = coding->dst_multibyte;
4492 int *charbuf = coding->charbuf;
4493 int *charbuf_end = charbuf + coding->charbuf_used;
4494 unsigned char *dst = coding->destination + coding->produced;
4495 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4496 int safe_room = 4;
4497 int produced_chars = 0;
4498 Lisp_Object attrs, charset_list, val;
4499 int ascii_compatible;
4500 struct charset *charset_roman, *charset_big5;
4501 int c;
4503 CODING_GET_INFO (coding, attrs, charset_list);
4504 val = charset_list;
4505 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4506 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4507 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4509 while (charbuf < charbuf_end)
4511 ASSURE_DESTINATION (safe_room);
4512 c = *charbuf++;
4513 /* Now encode the character C. */
4514 if (ASCII_CHAR_P (c) && ascii_compatible)
4515 EMIT_ONE_ASCII_BYTE (c);
4516 else if (CHAR_BYTE8_P (c))
4518 c = CHAR_TO_BYTE8 (c);
4519 EMIT_ONE_BYTE (c);
4521 else
4523 unsigned code;
4524 struct charset *charset = char_charset (c, charset_list, &code);
4526 if (! charset)
4528 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4530 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4531 charset = CHARSET_FROM_ID (charset_ascii);
4533 else
4535 c = coding->default_char;
4536 charset = char_charset (c, charset_list, &code);
4539 if (code == CHARSET_INVALID_CODE (charset))
4540 abort ();
4541 if (charset == charset_big5)
4543 int c1, c2;
4545 c1 = code >> 8, c2 = code & 0xFF;
4546 EMIT_TWO_BYTES (c1, c2);
4548 else
4549 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4552 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4553 coding->produced_char += produced_chars;
4554 coding->produced = dst - coding->destination;
4555 return 0;
4559 /*** 10. CCL handlers ***/
4561 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4562 Check if a text is encoded in a coding system of which
4563 encoder/decoder are written in CCL program. If it is, return
4564 CATEGORY_MASK_CCL, else return 0. */
4566 static int
4567 detect_coding_ccl (coding, detect_info)
4568 struct coding_system *coding;
4569 struct coding_detection_info *detect_info;
4571 const unsigned char *src = coding->source, *src_base;
4572 const unsigned char *src_end = coding->source + coding->src_bytes;
4573 int multibytep = coding->src_multibyte;
4574 int consumed_chars = 0;
4575 int found = 0;
4576 unsigned char *valids;
4577 int head_ascii = coding->head_ascii;
4578 Lisp_Object attrs;
4580 detect_info->checked |= CATEGORY_MASK_CCL;
4582 coding = &coding_categories[coding_category_ccl];
4583 valids = CODING_CCL_VALIDS (coding);
4584 attrs = CODING_ID_ATTRS (coding->id);
4585 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4586 src += head_ascii;
4588 while (1)
4590 int c;
4592 src_base = src;
4593 ONE_MORE_BYTE (c);
4594 if (c < 0 || ! valids[c])
4595 break;
4596 if ((valids[c] > 1))
4597 found = CATEGORY_MASK_CCL;
4599 detect_info->rejected |= CATEGORY_MASK_CCL;
4600 return 0;
4602 no_more_source:
4603 detect_info->found |= found;
4604 return 1;
4607 static void
4608 decode_coding_ccl (coding)
4609 struct coding_system *coding;
4611 const unsigned char *src = coding->source + coding->consumed;
4612 const unsigned char *src_end = coding->source + coding->src_bytes;
4613 int *charbuf = coding->charbuf + coding->charbuf_used;
4614 int *charbuf_end = coding->charbuf + coding->charbuf_size;
4615 int consumed_chars = 0;
4616 int multibytep = coding->src_multibyte;
4617 struct ccl_program ccl;
4618 int source_charbuf[1024];
4619 int source_byteidx[1024];
4620 Lisp_Object attrs, charset_list;
4622 CODING_GET_INFO (coding, attrs, charset_list);
4623 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4625 while (src < src_end)
4627 const unsigned char *p = src;
4628 int *source, *source_end;
4629 int i = 0;
4631 if (multibytep)
4632 while (i < 1024 && p < src_end)
4634 source_byteidx[i] = p - src;
4635 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4637 else
4638 while (i < 1024 && p < src_end)
4639 source_charbuf[i++] = *p++;
4641 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4642 ccl.last_block = 1;
4644 source = source_charbuf;
4645 source_end = source + i;
4646 while (source < source_end)
4648 ccl_driver (&ccl, source, charbuf,
4649 source_end - source, charbuf_end - charbuf,
4650 charset_list);
4651 source += ccl.consumed;
4652 charbuf += ccl.produced;
4653 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4654 break;
4656 if (source < source_end)
4657 src += source_byteidx[source - source_charbuf];
4658 else
4659 src = p;
4660 consumed_chars += source - source_charbuf;
4662 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4663 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4664 break;
4667 switch (ccl.status)
4669 case CCL_STAT_SUSPEND_BY_SRC:
4670 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4671 break;
4672 case CCL_STAT_SUSPEND_BY_DST:
4673 break;
4674 case CCL_STAT_QUIT:
4675 case CCL_STAT_INVALID_CMD:
4676 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4677 break;
4678 default:
4679 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4680 break;
4682 coding->consumed_char += consumed_chars;
4683 coding->consumed = src - coding->source;
4684 coding->charbuf_used = charbuf - coding->charbuf;
4687 static int
4688 encode_coding_ccl (coding)
4689 struct coding_system *coding;
4691 struct ccl_program ccl;
4692 int multibytep = coding->dst_multibyte;
4693 int *charbuf = coding->charbuf;
4694 int *charbuf_end = charbuf + coding->charbuf_used;
4695 unsigned char *dst = coding->destination + coding->produced;
4696 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4697 int destination_charbuf[1024];
4698 int i, produced_chars = 0;
4699 Lisp_Object attrs, charset_list;
4701 CODING_GET_INFO (coding, attrs, charset_list);
4702 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4704 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4705 ccl.dst_multibyte = coding->dst_multibyte;
4707 while (charbuf < charbuf_end)
4709 ccl_driver (&ccl, charbuf, destination_charbuf,
4710 charbuf_end - charbuf, 1024, charset_list);
4711 if (multibytep)
4713 ASSURE_DESTINATION (ccl.produced * 2);
4714 for (i = 0; i < ccl.produced; i++)
4715 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4717 else
4719 ASSURE_DESTINATION (ccl.produced);
4720 for (i = 0; i < ccl.produced; i++)
4721 *dst++ = destination_charbuf[i] & 0xFF;
4722 produced_chars += ccl.produced;
4724 charbuf += ccl.consumed;
4725 if (ccl.status == CCL_STAT_QUIT
4726 || ccl.status == CCL_STAT_INVALID_CMD)
4727 break;
4730 switch (ccl.status)
4732 case CCL_STAT_SUSPEND_BY_SRC:
4733 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4734 break;
4735 case CCL_STAT_SUSPEND_BY_DST:
4736 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4737 break;
4738 case CCL_STAT_QUIT:
4739 case CCL_STAT_INVALID_CMD:
4740 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4741 break;
4742 default:
4743 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4744 break;
4747 coding->produced_char += produced_chars;
4748 coding->produced = dst - coding->destination;
4749 return 0;
4754 /*** 10, 11. no-conversion handlers ***/
4756 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4758 static void
4759 decode_coding_raw_text (coding)
4760 struct coding_system *coding;
4762 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4764 coding->chars_at_source = 1;
4765 coding->consumed_char = coding->src_chars;
4766 coding->consumed = coding->src_bytes;
4767 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
4769 coding->consumed_char--;
4770 coding->consumed--;
4771 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4773 else
4774 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4777 static int
4778 encode_coding_raw_text (coding)
4779 struct coding_system *coding;
4781 int multibytep = coding->dst_multibyte;
4782 int *charbuf = coding->charbuf;
4783 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4784 unsigned char *dst = coding->destination + coding->produced;
4785 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4786 int produced_chars = 0;
4787 int c;
4789 if (multibytep)
4791 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4793 if (coding->src_multibyte)
4794 while (charbuf < charbuf_end)
4796 ASSURE_DESTINATION (safe_room);
4797 c = *charbuf++;
4798 if (ASCII_CHAR_P (c))
4799 EMIT_ONE_ASCII_BYTE (c);
4800 else if (CHAR_BYTE8_P (c))
4802 c = CHAR_TO_BYTE8 (c);
4803 EMIT_ONE_BYTE (c);
4805 else
4807 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4809 CHAR_STRING_ADVANCE (c, p1);
4810 while (p0 < p1)
4812 EMIT_ONE_BYTE (*p0);
4813 p0++;
4817 else
4818 while (charbuf < charbuf_end)
4820 ASSURE_DESTINATION (safe_room);
4821 c = *charbuf++;
4822 EMIT_ONE_BYTE (c);
4825 else
4827 if (coding->src_multibyte)
4829 int safe_room = MAX_MULTIBYTE_LENGTH;
4831 while (charbuf < charbuf_end)
4833 ASSURE_DESTINATION (safe_room);
4834 c = *charbuf++;
4835 if (ASCII_CHAR_P (c))
4836 *dst++ = c;
4837 else if (CHAR_BYTE8_P (c))
4838 *dst++ = CHAR_TO_BYTE8 (c);
4839 else
4840 CHAR_STRING_ADVANCE (c, dst);
4841 produced_chars++;
4844 else
4846 ASSURE_DESTINATION (charbuf_end - charbuf);
4847 while (charbuf < charbuf_end && dst < dst_end)
4848 *dst++ = *charbuf++;
4849 produced_chars = dst - (coding->destination + coding->dst_bytes);
4852 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4853 coding->produced_char += produced_chars;
4854 coding->produced = dst - coding->destination;
4855 return 0;
4858 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4859 Check if a text is encoded in a charset-based coding system. If it
4860 is, return 1, else return 0. */
4862 static int
4863 detect_coding_charset (coding, detect_info)
4864 struct coding_system *coding;
4865 struct coding_detection_info *detect_info;
4867 const unsigned char *src = coding->source, *src_base;
4868 const unsigned char *src_end = coding->source + coding->src_bytes;
4869 int multibytep = coding->src_multibyte;
4870 int consumed_chars = 0;
4871 Lisp_Object attrs, valids;
4872 int found = 0;
4873 int head_ascii = coding->head_ascii;
4875 detect_info->checked |= CATEGORY_MASK_CHARSET;
4877 coding = &coding_categories[coding_category_charset];
4878 attrs = CODING_ID_ATTRS (coding->id);
4879 valids = AREF (attrs, coding_attr_charset_valids);
4881 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4882 src += head_ascii;
4884 while (1)
4886 int c;
4887 Lisp_Object val;
4888 struct charset *charset;
4889 int dim, idx;
4891 src_base = src;
4892 ONE_MORE_BYTE (c);
4893 if (c < 0)
4894 continue;
4895 val = AREF (valids, c);
4896 if (NILP (val))
4897 break;
4898 if (c >= 0x80)
4899 found = CATEGORY_MASK_CHARSET;
4900 if (INTEGERP (val))
4902 charset = CHARSET_FROM_ID (XFASTINT (val));
4903 dim = CHARSET_DIMENSION (charset);
4904 for (idx = 1; idx < dim; idx++)
4906 if (src == src_end)
4907 goto too_short;
4908 ONE_MORE_BYTE (c);
4909 if (c < charset->code_space[(dim - 1 - idx) * 2]
4910 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
4911 break;
4913 if (idx < dim)
4914 break;
4916 else
4918 idx = 1;
4919 for (; CONSP (val); val = XCDR (val))
4921 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4922 dim = CHARSET_DIMENSION (charset);
4923 while (idx < dim)
4925 if (src == src_end)
4926 goto too_short;
4927 ONE_MORE_BYTE (c);
4928 if (c < charset->code_space[(dim - 1 - idx) * 4]
4929 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
4930 break;
4931 idx++;
4933 if (idx == dim)
4935 val = Qnil;
4936 break;
4939 if (CONSP (val))
4940 break;
4943 too_short:
4944 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4945 return 0;
4947 no_more_source:
4948 detect_info->found |= found;
4949 return 1;
4952 static void
4953 decode_coding_charset (coding)
4954 struct coding_system *coding;
4956 const unsigned char *src = coding->source + coding->consumed;
4957 const unsigned char *src_end = coding->source + coding->src_bytes;
4958 const unsigned char *src_base;
4959 int *charbuf = coding->charbuf + coding->charbuf_used;
4960 int *charbuf_end
4961 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4962 int consumed_chars = 0, consumed_chars_base;
4963 int multibytep = coding->src_multibyte;
4964 Lisp_Object attrs, charset_list, valids;
4965 int char_offset = coding->produced_char;
4966 int last_offset = char_offset;
4967 int last_id = charset_ascii;
4968 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4969 int byte_after_cr = -1;
4971 CODING_GET_INFO (coding, attrs, charset_list);
4972 valids = AREF (attrs, coding_attr_charset_valids);
4974 while (1)
4976 int c;
4977 Lisp_Object val;
4978 struct charset *charset;
4979 int dim;
4980 int len = 1;
4981 unsigned code;
4983 src_base = src;
4984 consumed_chars_base = consumed_chars;
4986 if (charbuf >= charbuf_end)
4987 break;
4989 if (byte_after_cr >= 0)
4991 c = byte_after_cr;
4992 byte_after_cr = -1;
4994 else
4996 ONE_MORE_BYTE (c);
4997 if (eol_crlf && c == '\r')
4998 ONE_MORE_BYTE (byte_after_cr);
5000 if (c < 0)
5001 goto invalid_code;
5002 code = c;
5004 val = AREF (valids, c);
5005 if (NILP (val))
5006 goto invalid_code;
5007 if (INTEGERP (val))
5009 charset = CHARSET_FROM_ID (XFASTINT (val));
5010 dim = CHARSET_DIMENSION (charset);
5011 while (len < dim)
5013 ONE_MORE_BYTE (c);
5014 code = (code << 8) | c;
5015 len++;
5017 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5018 charset, code, c);
5020 else
5022 /* VAL is a list of charset IDs. It is assured that the
5023 list is sorted by charset dimensions (smaller one
5024 comes first). */
5025 while (CONSP (val))
5027 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5028 dim = CHARSET_DIMENSION (charset);
5029 while (len < dim)
5031 ONE_MORE_BYTE (c);
5032 code = (code << 8) | c;
5033 len++;
5035 CODING_DECODE_CHAR (coding, src, src_base,
5036 src_end, charset, code, c);
5037 if (c >= 0)
5038 break;
5039 val = XCDR (val);
5042 if (c < 0)
5043 goto invalid_code;
5044 if (charset->id != charset_ascii
5045 && last_id != charset->id)
5047 if (last_id != charset_ascii)
5048 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5049 last_id = charset->id;
5050 last_offset = char_offset;
5053 *charbuf++ = c;
5054 char_offset++;
5055 continue;
5057 invalid_code:
5058 src = src_base;
5059 consumed_chars = consumed_chars_base;
5060 ONE_MORE_BYTE (c);
5061 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5062 char_offset++;
5063 coding->errors++;
5066 no_more_source:
5067 if (last_id != charset_ascii)
5068 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5069 coding->consumed_char += consumed_chars_base;
5070 coding->consumed = src_base - coding->source;
5071 coding->charbuf_used = charbuf - coding->charbuf;
5074 static int
5075 encode_coding_charset (coding)
5076 struct coding_system *coding;
5078 int multibytep = coding->dst_multibyte;
5079 int *charbuf = coding->charbuf;
5080 int *charbuf_end = charbuf + coding->charbuf_used;
5081 unsigned char *dst = coding->destination + coding->produced;
5082 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5083 int safe_room = MAX_MULTIBYTE_LENGTH;
5084 int produced_chars = 0;
5085 Lisp_Object attrs, charset_list;
5086 int ascii_compatible;
5087 int c;
5089 CODING_GET_INFO (coding, attrs, charset_list);
5090 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5092 while (charbuf < charbuf_end)
5094 struct charset *charset;
5095 unsigned code;
5097 ASSURE_DESTINATION (safe_room);
5098 c = *charbuf++;
5099 if (ascii_compatible && ASCII_CHAR_P (c))
5100 EMIT_ONE_ASCII_BYTE (c);
5101 else if (CHAR_BYTE8_P (c))
5103 c = CHAR_TO_BYTE8 (c);
5104 EMIT_ONE_BYTE (c);
5106 else
5108 charset = char_charset (c, charset_list, &code);
5109 if (charset)
5111 if (CHARSET_DIMENSION (charset) == 1)
5112 EMIT_ONE_BYTE (code);
5113 else if (CHARSET_DIMENSION (charset) == 2)
5114 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5115 else if (CHARSET_DIMENSION (charset) == 3)
5116 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5117 else
5118 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5119 (code >> 8) & 0xFF, code & 0xFF);
5121 else
5123 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5124 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5125 else
5126 c = coding->default_char;
5127 EMIT_ONE_BYTE (c);
5132 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5133 coding->produced_char += produced_chars;
5134 coding->produced = dst - coding->destination;
5135 return 0;
5139 /*** 7. C library functions ***/
5141 /* Setup coding context CODING from information about CODING_SYSTEM.
5142 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5143 CODING_SYSTEM is invalid, signal an error. */
5145 void
5146 setup_coding_system (coding_system, coding)
5147 Lisp_Object coding_system;
5148 struct coding_system *coding;
5150 Lisp_Object attrs;
5151 Lisp_Object eol_type;
5152 Lisp_Object coding_type;
5153 Lisp_Object val;
5155 if (NILP (coding_system))
5156 coding_system = Qundecided;
5158 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5160 attrs = CODING_ID_ATTRS (coding->id);
5161 eol_type = CODING_ID_EOL_TYPE (coding->id);
5163 coding->mode = 0;
5164 coding->head_ascii = -1;
5165 if (VECTORP (eol_type))
5166 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5167 | CODING_REQUIRE_DETECTION_MASK);
5168 else if (! EQ (eol_type, Qunix))
5169 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5170 | CODING_REQUIRE_ENCODING_MASK);
5171 else
5172 coding->common_flags = 0;
5173 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5174 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5175 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5176 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5177 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5178 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5180 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5181 coding->max_charset_id = SCHARS (val) - 1;
5182 coding->safe_charsets = (char *) SDATA (val);
5183 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5185 coding_type = CODING_ATTR_TYPE (attrs);
5186 if (EQ (coding_type, Qundecided))
5188 coding->detector = NULL;
5189 coding->decoder = decode_coding_raw_text;
5190 coding->encoder = encode_coding_raw_text;
5191 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5193 else if (EQ (coding_type, Qiso_2022))
5195 int i;
5196 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5198 /* Invoke graphic register 0 to plane 0. */
5199 CODING_ISO_INVOCATION (coding, 0) = 0;
5200 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5201 CODING_ISO_INVOCATION (coding, 1)
5202 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5203 /* Setup the initial status of designation. */
5204 for (i = 0; i < 4; i++)
5205 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5206 /* Not single shifting initially. */
5207 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5208 /* Beginning of buffer should also be regarded as bol. */
5209 CODING_ISO_BOL (coding) = 1;
5210 coding->detector = detect_coding_iso_2022;
5211 coding->decoder = decode_coding_iso_2022;
5212 coding->encoder = encode_coding_iso_2022;
5213 if (flags & CODING_ISO_FLAG_SAFE)
5214 coding->mode |= CODING_MODE_SAFE_ENCODING;
5215 coding->common_flags
5216 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5217 | CODING_REQUIRE_FLUSHING_MASK);
5218 if (flags & CODING_ISO_FLAG_COMPOSITION)
5219 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5220 if (flags & CODING_ISO_FLAG_DESIGNATION)
5221 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5222 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5224 setup_iso_safe_charsets (attrs);
5225 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5226 coding->max_charset_id = SCHARS (val) - 1;
5227 coding->safe_charsets = (char *) SDATA (val);
5229 CODING_ISO_FLAGS (coding) = flags;
5231 else if (EQ (coding_type, Qcharset))
5233 coding->detector = detect_coding_charset;
5234 coding->decoder = decode_coding_charset;
5235 coding->encoder = encode_coding_charset;
5236 coding->common_flags
5237 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5239 else if (EQ (coding_type, Qutf_8))
5241 coding->detector = detect_coding_utf_8;
5242 coding->decoder = decode_coding_utf_8;
5243 coding->encoder = encode_coding_utf_8;
5244 coding->common_flags
5245 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5247 else if (EQ (coding_type, Qutf_16))
5249 val = AREF (attrs, coding_attr_utf_16_bom);
5250 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5251 : EQ (val, Qt) ? utf_16_with_bom
5252 : utf_16_without_bom);
5253 val = AREF (attrs, coding_attr_utf_16_endian);
5254 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5255 : utf_16_little_endian);
5256 CODING_UTF_16_SURROGATE (coding) = 0;
5257 coding->detector = detect_coding_utf_16;
5258 coding->decoder = decode_coding_utf_16;
5259 coding->encoder = encode_coding_utf_16;
5260 coding->common_flags
5261 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5262 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5263 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5265 else if (EQ (coding_type, Qccl))
5267 coding->detector = detect_coding_ccl;
5268 coding->decoder = decode_coding_ccl;
5269 coding->encoder = encode_coding_ccl;
5270 coding->common_flags
5271 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5272 | CODING_REQUIRE_FLUSHING_MASK);
5274 else if (EQ (coding_type, Qemacs_mule))
5276 coding->detector = detect_coding_emacs_mule;
5277 coding->decoder = decode_coding_emacs_mule;
5278 coding->encoder = encode_coding_emacs_mule;
5279 coding->common_flags
5280 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5281 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5282 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5284 Lisp_Object tail, safe_charsets;
5285 int max_charset_id = 0;
5287 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5288 tail = XCDR (tail))
5289 if (max_charset_id < XFASTINT (XCAR (tail)))
5290 max_charset_id = XFASTINT (XCAR (tail));
5291 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5292 make_number (255));
5293 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5294 tail = XCDR (tail))
5295 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5296 coding->max_charset_id = max_charset_id;
5297 coding->safe_charsets = (char *) SDATA (safe_charsets);
5300 else if (EQ (coding_type, Qshift_jis))
5302 coding->detector = detect_coding_sjis;
5303 coding->decoder = decode_coding_sjis;
5304 coding->encoder = encode_coding_sjis;
5305 coding->common_flags
5306 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5308 else if (EQ (coding_type, Qbig5))
5310 coding->detector = detect_coding_big5;
5311 coding->decoder = decode_coding_big5;
5312 coding->encoder = encode_coding_big5;
5313 coding->common_flags
5314 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5316 else /* EQ (coding_type, Qraw_text) */
5318 coding->detector = NULL;
5319 coding->decoder = decode_coding_raw_text;
5320 coding->encoder = encode_coding_raw_text;
5321 if (! EQ (eol_type, Qunix))
5323 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5324 if (! VECTORP (eol_type))
5325 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5330 return;
5333 /* Return a list of charsets supported by CODING. */
5335 Lisp_Object
5336 coding_charset_list (coding)
5337 struct coding_system *coding;
5339 Lisp_Object attrs, charset_list;
5341 CODING_GET_INFO (coding, attrs, charset_list);
5342 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5344 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5346 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5347 charset_list = Viso_2022_charset_list;
5349 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5351 charset_list = Vemacs_mule_charset_list;
5353 return charset_list;
5357 /* Return raw-text or one of its subsidiaries that has the same
5358 eol_type as CODING-SYSTEM. */
5360 Lisp_Object
5361 raw_text_coding_system (coding_system)
5362 Lisp_Object coding_system;
5364 Lisp_Object spec, attrs;
5365 Lisp_Object eol_type, raw_text_eol_type;
5367 if (NILP (coding_system))
5368 return Qraw_text;
5369 spec = CODING_SYSTEM_SPEC (coding_system);
5370 attrs = AREF (spec, 0);
5372 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5373 return coding_system;
5375 eol_type = AREF (spec, 2);
5376 if (VECTORP (eol_type))
5377 return Qraw_text;
5378 spec = CODING_SYSTEM_SPEC (Qraw_text);
5379 raw_text_eol_type = AREF (spec, 2);
5380 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5381 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5382 : AREF (raw_text_eol_type, 2));
5386 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5387 does, return one of the subsidiary that has the same eol-spec as
5388 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
5389 inherit end-of-line format from the system's setting
5390 (system_eol_type). */
5392 Lisp_Object
5393 coding_inherit_eol_type (coding_system, parent)
5394 Lisp_Object coding_system, parent;
5396 Lisp_Object spec, eol_type;
5398 if (NILP (coding_system))
5399 coding_system = Qraw_text;
5400 spec = CODING_SYSTEM_SPEC (coding_system);
5401 eol_type = AREF (spec, 2);
5402 if (VECTORP (eol_type))
5404 Lisp_Object parent_eol_type;
5406 if (! NILP (parent))
5408 Lisp_Object parent_spec;
5410 parent_spec = CODING_SYSTEM_SPEC (parent);
5411 parent_eol_type = AREF (parent_spec, 2);
5413 else
5414 parent_eol_type = system_eol_type;
5415 if (EQ (parent_eol_type, Qunix))
5416 coding_system = AREF (eol_type, 0);
5417 else if (EQ (parent_eol_type, Qdos))
5418 coding_system = AREF (eol_type, 1);
5419 else if (EQ (parent_eol_type, Qmac))
5420 coding_system = AREF (eol_type, 2);
5422 return coding_system;
5425 /* Emacs has a mechanism to automatically detect a coding system if it
5426 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5427 it's impossible to distinguish some coding systems accurately
5428 because they use the same range of codes. So, at first, coding
5429 systems are categorized into 7, those are:
5431 o coding-category-emacs-mule
5433 The category for a coding system which has the same code range
5434 as Emacs' internal format. Assigned the coding-system (Lisp
5435 symbol) `emacs-mule' by default.
5437 o coding-category-sjis
5439 The category for a coding system which has the same code range
5440 as SJIS. Assigned the coding-system (Lisp
5441 symbol) `japanese-shift-jis' by default.
5443 o coding-category-iso-7
5445 The category for a coding system which has the same code range
5446 as ISO2022 of 7-bit environment. This doesn't use any locking
5447 shift and single shift functions. This can encode/decode all
5448 charsets. Assigned the coding-system (Lisp symbol)
5449 `iso-2022-7bit' by default.
5451 o coding-category-iso-7-tight
5453 Same as coding-category-iso-7 except that this can
5454 encode/decode only the specified charsets.
5456 o coding-category-iso-8-1
5458 The category for a coding system which has the same code range
5459 as ISO2022 of 8-bit environment and graphic plane 1 used only
5460 for DIMENSION1 charset. This doesn't use any locking shift
5461 and single shift functions. Assigned the coding-system (Lisp
5462 symbol) `iso-latin-1' by default.
5464 o coding-category-iso-8-2
5466 The category for a coding system which has the same code range
5467 as ISO2022 of 8-bit environment and graphic plane 1 used only
5468 for DIMENSION2 charset. This doesn't use any locking shift
5469 and single shift functions. Assigned the coding-system (Lisp
5470 symbol) `japanese-iso-8bit' by default.
5472 o coding-category-iso-7-else
5474 The category for a coding system which has the same code range
5475 as ISO2022 of 7-bit environemnt but uses locking shift or
5476 single shift functions. Assigned the coding-system (Lisp
5477 symbol) `iso-2022-7bit-lock' by default.
5479 o coding-category-iso-8-else
5481 The category for a coding system which has the same code range
5482 as ISO2022 of 8-bit environemnt but uses locking shift or
5483 single shift functions. Assigned the coding-system (Lisp
5484 symbol) `iso-2022-8bit-ss2' by default.
5486 o coding-category-big5
5488 The category for a coding system which has the same code range
5489 as BIG5. Assigned the coding-system (Lisp symbol)
5490 `cn-big5' by default.
5492 o coding-category-utf-8
5494 The category for a coding system which has the same code range
5495 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
5496 symbol) `utf-8' by default.
5498 o coding-category-utf-16-be
5500 The category for a coding system in which a text has an
5501 Unicode signature (cf. Unicode Standard) in the order of BIG
5502 endian at the head. Assigned the coding-system (Lisp symbol)
5503 `utf-16-be' by default.
5505 o coding-category-utf-16-le
5507 The category for a coding system in which a text has an
5508 Unicode signature (cf. Unicode Standard) in the order of
5509 LITTLE endian at the head. Assigned the coding-system (Lisp
5510 symbol) `utf-16-le' by default.
5512 o coding-category-ccl
5514 The category for a coding system of which encoder/decoder is
5515 written in CCL programs. The default value is nil, i.e., no
5516 coding system is assigned.
5518 o coding-category-binary
5520 The category for a coding system not categorized in any of the
5521 above. Assigned the coding-system (Lisp symbol)
5522 `no-conversion' by default.
5524 Each of them is a Lisp symbol and the value is an actual
5525 `coding-system's (this is also a Lisp symbol) assigned by a user.
5526 What Emacs does actually is to detect a category of coding system.
5527 Then, it uses a `coding-system' assigned to it. If Emacs can't
5528 decide only one possible category, it selects a category of the
5529 highest priority. Priorities of categories are also specified by a
5530 user in a Lisp variable `coding-category-list'.
5534 #define EOL_SEEN_NONE 0
5535 #define EOL_SEEN_LF 1
5536 #define EOL_SEEN_CR 2
5537 #define EOL_SEEN_CRLF 4
5539 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5540 SOURCE is encoded. If CATEGORY is one of
5541 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5542 two-byte, else they are encoded by one-byte.
5544 Return one of EOL_SEEN_XXX. */
5546 #define MAX_EOL_CHECK_COUNT 3
5548 static int
5549 detect_eol (source, src_bytes, category)
5550 const unsigned char *source;
5551 EMACS_INT src_bytes;
5552 enum coding_category category;
5554 const unsigned char *src = source, *src_end = src + src_bytes;
5555 unsigned char c;
5556 int total = 0;
5557 int eol_seen = EOL_SEEN_NONE;
5559 if ((1 << category) & CATEGORY_MASK_UTF_16)
5561 int msb, lsb;
5563 msb = category == (coding_category_utf_16_le
5564 | coding_category_utf_16_le_nosig);
5565 lsb = 1 - msb;
5567 while (src + 1 < src_end)
5569 c = src[lsb];
5570 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5572 int this_eol;
5574 if (c == '\n')
5575 this_eol = EOL_SEEN_LF;
5576 else if (src + 3 >= src_end
5577 || src[msb + 2] != 0
5578 || src[lsb + 2] != '\n')
5579 this_eol = EOL_SEEN_CR;
5580 else
5581 this_eol = EOL_SEEN_CRLF;
5583 if (eol_seen == EOL_SEEN_NONE)
5584 /* This is the first end-of-line. */
5585 eol_seen = this_eol;
5586 else if (eol_seen != this_eol)
5588 /* The found type is different from what found before. */
5589 eol_seen = EOL_SEEN_LF;
5590 break;
5592 if (++total == MAX_EOL_CHECK_COUNT)
5593 break;
5595 src += 2;
5598 else
5600 while (src < src_end)
5602 c = *src++;
5603 if (c == '\n' || c == '\r')
5605 int this_eol;
5607 if (c == '\n')
5608 this_eol = EOL_SEEN_LF;
5609 else if (src >= src_end || *src != '\n')
5610 this_eol = EOL_SEEN_CR;
5611 else
5612 this_eol = EOL_SEEN_CRLF, src++;
5614 if (eol_seen == EOL_SEEN_NONE)
5615 /* This is the first end-of-line. */
5616 eol_seen = this_eol;
5617 else if (eol_seen != this_eol)
5619 /* The found type is different from what found before. */
5620 eol_seen = EOL_SEEN_LF;
5621 break;
5623 if (++total == MAX_EOL_CHECK_COUNT)
5624 break;
5628 return eol_seen;
5632 static Lisp_Object
5633 adjust_coding_eol_type (coding, eol_seen)
5634 struct coding_system *coding;
5635 int eol_seen;
5637 Lisp_Object eol_type;
5639 eol_type = CODING_ID_EOL_TYPE (coding->id);
5640 if (eol_seen & EOL_SEEN_LF)
5642 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5643 eol_type = Qunix;
5645 else if (eol_seen & EOL_SEEN_CRLF)
5647 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5648 eol_type = Qdos;
5650 else if (eol_seen & EOL_SEEN_CR)
5652 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5653 eol_type = Qmac;
5655 return eol_type;
5658 /* Detect how a text specified in CODING is encoded. If a coding
5659 system is detected, update fields of CODING by the detected coding
5660 system. */
5662 void
5663 detect_coding (coding)
5664 struct coding_system *coding;
5666 const unsigned char *src, *src_end;
5668 coding->consumed = coding->consumed_char = 0;
5669 coding->produced = coding->produced_char = 0;
5670 coding_set_source (coding);
5672 src_end = coding->source + coding->src_bytes;
5674 /* If we have not yet decided the text encoding type, detect it
5675 now. */
5676 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5678 int c, i;
5679 struct coding_detection_info detect_info;
5681 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5682 for (i = 0, src = coding->source; src < src_end; i++, src++)
5684 c = *src;
5685 if (c & 0x80)
5686 break;
5687 if (c < 0x20
5688 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5689 && ! inhibit_iso_escape_detection
5690 && ! detect_info.checked)
5692 coding->head_ascii = src - (coding->source + coding->consumed);
5693 if (detect_coding_iso_2022 (coding, &detect_info))
5695 /* We have scanned the whole data. */
5696 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5697 /* We didn't find an 8-bit code. */
5698 src = src_end;
5699 break;
5703 coding->head_ascii = src - (coding->source + coding->consumed);
5705 if (coding->head_ascii < coding->src_bytes
5706 || detect_info.found)
5708 enum coding_category category;
5709 struct coding_system *this;
5711 if (coding->head_ascii == coding->src_bytes)
5712 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
5713 for (i = 0; i < coding_category_raw_text; i++)
5715 category = coding_priorities[i];
5716 this = coding_categories + category;
5717 if (detect_info.found & (1 << category))
5718 break;
5720 else
5721 for (i = 0; i < coding_category_raw_text; i++)
5723 category = coding_priorities[i];
5724 this = coding_categories + category;
5725 if (this->id < 0)
5727 /* No coding system of this category is defined. */
5728 detect_info.rejected |= (1 << category);
5730 else if (category >= coding_category_raw_text)
5731 continue;
5732 else if (detect_info.checked & (1 << category))
5734 if (detect_info.found & (1 << category))
5735 break;
5737 else if ((*(this->detector)) (coding, &detect_info)
5738 && detect_info.found & (1 << category))
5740 if (category == coding_category_utf_16_auto)
5742 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5743 category = coding_category_utf_16_le;
5744 else
5745 category = coding_category_utf_16_be;
5747 break;
5751 if (i < coding_category_raw_text)
5752 setup_coding_system (CODING_ID_NAME (this->id), coding);
5753 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5754 setup_coding_system (Qraw_text, coding);
5755 else if (detect_info.rejected)
5756 for (i = 0; i < coding_category_raw_text; i++)
5757 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5759 this = coding_categories + coding_priorities[i];
5760 setup_coding_system (CODING_ID_NAME (this->id), coding);
5761 break;
5765 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5766 == coding_category_utf_16_auto)
5768 Lisp_Object coding_systems;
5769 struct coding_detection_info detect_info;
5771 coding_systems
5772 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5773 detect_info.found = detect_info.rejected = 0;
5774 if (CONSP (coding_systems)
5775 && detect_coding_utf_16 (coding, &detect_info))
5777 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5778 setup_coding_system (XCAR (coding_systems), coding);
5779 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5780 setup_coding_system (XCDR (coding_systems), coding);
5786 static void
5787 decode_eol (coding)
5788 struct coding_system *coding;
5790 Lisp_Object eol_type;
5791 unsigned char *p, *pbeg, *pend;
5793 eol_type = CODING_ID_EOL_TYPE (coding->id);
5794 if (EQ (eol_type, Qunix))
5795 return;
5797 if (NILP (coding->dst_object))
5798 pbeg = coding->destination;
5799 else
5800 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5801 pend = pbeg + coding->produced;
5803 if (VECTORP (eol_type))
5805 int eol_seen = EOL_SEEN_NONE;
5807 for (p = pbeg; p < pend; p++)
5809 if (*p == '\n')
5810 eol_seen |= EOL_SEEN_LF;
5811 else if (*p == '\r')
5813 if (p + 1 < pend && *(p + 1) == '\n')
5815 eol_seen |= EOL_SEEN_CRLF;
5816 p++;
5818 else
5819 eol_seen |= EOL_SEEN_CR;
5822 if (eol_seen != EOL_SEEN_NONE
5823 && eol_seen != EOL_SEEN_LF
5824 && eol_seen != EOL_SEEN_CRLF
5825 && eol_seen != EOL_SEEN_CR)
5826 eol_seen = EOL_SEEN_LF;
5827 if (eol_seen != EOL_SEEN_NONE)
5828 eol_type = adjust_coding_eol_type (coding, eol_seen);
5831 if (EQ (eol_type, Qmac))
5833 for (p = pbeg; p < pend; p++)
5834 if (*p == '\r')
5835 *p = '\n';
5837 else if (EQ (eol_type, Qdos))
5839 int n = 0;
5841 if (NILP (coding->dst_object))
5843 /* Start deleting '\r' from the tail to minimize the memory
5844 movement. */
5845 for (p = pend - 2; p >= pbeg; p--)
5846 if (*p == '\r')
5848 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5849 n++;
5852 else
5854 int pos_byte = coding->dst_pos_byte;
5855 int pos = coding->dst_pos;
5856 int pos_end = pos + coding->produced_char - 1;
5858 while (pos < pos_end)
5860 p = BYTE_POS_ADDR (pos_byte);
5861 if (*p == '\r' && p[1] == '\n')
5863 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5864 n++;
5865 pos_end--;
5867 pos++;
5868 if (coding->dst_multibyte)
5869 pos_byte += BYTES_BY_CHAR_HEAD (*p);
5870 else
5871 pos_byte++;
5874 coding->produced -= n;
5875 coding->produced_char -= n;
5880 /* Return a translation table (or list of them) from coding system
5881 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5882 decoding (ENCODEP is zero). */
5884 static Lisp_Object
5885 get_translation_table (attrs, encodep, max_lookup)
5886 Lisp_Object attrs;
5887 int encodep, *max_lookup;
5889 Lisp_Object standard, translation_table;
5890 Lisp_Object val;
5892 if (encodep)
5893 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5894 standard = Vstandard_translation_table_for_encode;
5895 else
5896 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5897 standard = Vstandard_translation_table_for_decode;
5898 if (NILP (translation_table))
5899 translation_table = standard;
5900 else
5902 if (SYMBOLP (translation_table))
5903 translation_table = Fget (translation_table, Qtranslation_table);
5904 else if (CONSP (translation_table))
5906 translation_table = Fcopy_sequence (translation_table);
5907 for (val = translation_table; CONSP (val); val = XCDR (val))
5908 if (SYMBOLP (XCAR (val)))
5909 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5911 if (CHAR_TABLE_P (standard))
5913 if (CONSP (translation_table))
5914 translation_table = nconc2 (translation_table,
5915 Fcons (standard, Qnil));
5916 else
5917 translation_table = Fcons (translation_table,
5918 Fcons (standard, Qnil));
5922 if (max_lookup)
5924 *max_lookup = 1;
5925 if (CHAR_TABLE_P (translation_table)
5926 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5928 val = XCHAR_TABLE (translation_table)->extras[1];
5929 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5930 *max_lookup = XFASTINT (val);
5932 else if (CONSP (translation_table))
5934 Lisp_Object tail, val;
5936 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5937 if (CHAR_TABLE_P (XCAR (tail))
5938 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5940 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5941 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5942 *max_lookup = XFASTINT (val);
5946 return translation_table;
5949 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
5950 do { \
5951 trans = Qnil; \
5952 if (CHAR_TABLE_P (table)) \
5954 trans = CHAR_TABLE_REF (table, c); \
5955 if (CHARACTERP (trans)) \
5956 c = XFASTINT (trans), trans = Qnil; \
5958 else if (CONSP (table)) \
5960 Lisp_Object tail; \
5962 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
5963 if (CHAR_TABLE_P (XCAR (tail))) \
5965 trans = CHAR_TABLE_REF (XCAR (tail), c); \
5966 if (CHARACTERP (trans)) \
5967 c = XFASTINT (trans), trans = Qnil; \
5968 else if (! NILP (trans)) \
5969 break; \
5972 } while (0)
5975 static Lisp_Object
5976 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5977 Lisp_Object val;
5978 int *buf, *buf_end;
5979 int last_block;
5980 int *from_nchars, *to_nchars;
5982 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
5983 [TO-CHAR ...]. */
5984 if (CONSP (val))
5986 Lisp_Object from, tail;
5987 int i, len;
5989 for (tail = val; CONSP (tail); tail = XCDR (tail))
5991 val = XCAR (tail);
5992 from = XCAR (val);
5993 len = ASIZE (from);
5994 for (i = 0; i < len; i++)
5996 if (buf + i == buf_end)
5998 if (! last_block)
5999 return Qt;
6000 break;
6002 if (XINT (AREF (from, i)) != buf[i])
6003 break;
6005 if (i == len)
6007 val = XCDR (val);
6008 *from_nchars = len;
6009 break;
6012 if (! CONSP (tail))
6013 return Qnil;
6015 if (VECTORP (val))
6016 *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
6017 else
6018 *buf = XINT (val);
6019 return val;
6023 static int
6024 produce_chars (coding, translation_table, last_block)
6025 struct coding_system *coding;
6026 Lisp_Object translation_table;
6027 int last_block;
6029 unsigned char *dst = coding->destination + coding->produced;
6030 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6031 EMACS_INT produced;
6032 EMACS_INT produced_chars = 0;
6033 int carryover = 0;
6035 if (! coding->chars_at_source)
6037 /* Source characters are in coding->charbuf. */
6038 int *buf = coding->charbuf;
6039 int *buf_end = buf + coding->charbuf_used;
6041 if (EQ (coding->src_object, coding->dst_object))
6043 coding_set_source (coding);
6044 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6047 while (buf < buf_end)
6049 int c = *buf, i;
6051 if (c >= 0)
6053 int from_nchars = 1, to_nchars = 1;
6054 Lisp_Object trans = Qnil;
6056 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6057 if (! NILP (trans))
6059 trans = get_translation (trans, buf, buf_end, last_block,
6060 &from_nchars, &to_nchars);
6061 if (EQ (trans, Qt))
6062 break;
6063 c = *buf;
6066 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6068 dst = alloc_destination (coding,
6069 buf_end - buf
6070 + MAX_MULTIBYTE_LENGTH * to_nchars,
6071 dst);
6072 if (EQ (coding->src_object, coding->dst_object))
6074 coding_set_source (coding);
6075 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6077 else
6078 dst_end = coding->destination + coding->dst_bytes;
6081 for (i = 0; i < to_nchars; i++)
6083 if (i > 0)
6084 c = XINT (AREF (trans, i));
6085 if (coding->dst_multibyte
6086 || ! CHAR_BYTE8_P (c))
6087 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6088 else
6089 *dst++ = CHAR_TO_BYTE8 (c);
6091 produced_chars += to_nchars;
6092 *buf++ = to_nchars;
6093 while (--from_nchars > 0)
6094 *buf++ = 0;
6096 else
6097 /* This is an annotation datum. (-C) is the length. */
6098 buf += -c;
6100 carryover = buf_end - buf;
6102 else
6104 /* Source characters are at coding->source. */
6105 const unsigned char *src = coding->source;
6106 const unsigned char *src_end = src + coding->consumed;
6108 if (EQ (coding->dst_object, coding->src_object))
6109 dst_end = (unsigned char *) src;
6110 if (coding->src_multibyte != coding->dst_multibyte)
6112 if (coding->src_multibyte)
6114 int multibytep = 1;
6115 EMACS_INT consumed_chars;
6117 while (1)
6119 const unsigned char *src_base = src;
6120 int c;
6122 ONE_MORE_BYTE (c);
6123 if (dst == dst_end)
6125 if (EQ (coding->src_object, coding->dst_object))
6126 dst_end = (unsigned char *) src;
6127 if (dst == dst_end)
6129 EMACS_INT offset = src - coding->source;
6131 dst = alloc_destination (coding, src_end - src + 1,
6132 dst);
6133 dst_end = coding->destination + coding->dst_bytes;
6134 coding_set_source (coding);
6135 src = coding->source + offset;
6136 src_end = coding->source + coding->src_bytes;
6137 if (EQ (coding->src_object, coding->dst_object))
6138 dst_end = (unsigned char *) src;
6141 *dst++ = c;
6142 produced_chars++;
6144 no_more_source:
6147 else
6148 while (src < src_end)
6150 int multibytep = 1;
6151 int c = *src++;
6153 if (dst >= dst_end - 1)
6155 if (EQ (coding->src_object, coding->dst_object))
6156 dst_end = (unsigned char *) src;
6157 if (dst >= dst_end - 1)
6159 EMACS_INT offset = src - coding->source;
6160 EMACS_INT more_bytes;
6162 if (EQ (coding->src_object, coding->dst_object))
6163 more_bytes = ((src_end - src) / 2) + 2;
6164 else
6165 more_bytes = src_end - src + 2;
6166 dst = alloc_destination (coding, more_bytes, dst);
6167 dst_end = coding->destination + coding->dst_bytes;
6168 coding_set_source (coding);
6169 src = coding->source + offset;
6170 src_end = coding->source + coding->src_bytes;
6171 if (EQ (coding->src_object, coding->dst_object))
6172 dst_end = (unsigned char *) src;
6175 EMIT_ONE_BYTE (c);
6178 else
6180 if (!EQ (coding->src_object, coding->dst_object))
6182 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6184 if (require > 0)
6186 EMACS_INT offset = src - coding->source;
6188 dst = alloc_destination (coding, require, dst);
6189 coding_set_source (coding);
6190 src = coding->source + offset;
6191 src_end = coding->source + coding->src_bytes;
6194 produced_chars = coding->consumed_char;
6195 while (src < src_end)
6196 *dst++ = *src++;
6200 produced = dst - (coding->destination + coding->produced);
6201 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6202 insert_from_gap (produced_chars, produced);
6203 coding->produced += produced;
6204 coding->produced_char += produced_chars;
6205 return carryover;
6208 /* Compose text in CODING->object according to the annotation data at
6209 CHARBUF. CHARBUF is an array:
6210 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
6213 static INLINE void
6214 produce_composition (coding, charbuf, pos)
6215 struct coding_system *coding;
6216 int *charbuf;
6217 EMACS_INT pos;
6219 int len;
6220 EMACS_INT to;
6221 enum composition_method method;
6222 Lisp_Object components;
6224 len = -charbuf[0];
6225 to = pos + charbuf[2];
6226 if (to <= pos)
6227 return;
6228 method = (enum composition_method) (charbuf[3]);
6230 if (method == COMPOSITION_RELATIVE)
6231 components = Qnil;
6232 else if (method >= COMPOSITION_WITH_RULE
6233 && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6235 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6236 int i;
6238 len -= 4;
6239 charbuf += 4;
6240 for (i = 0; i < len; i++)
6242 args[i] = make_number (charbuf[i]);
6243 if (charbuf[i] < 0)
6244 return;
6246 components = (method == COMPOSITION_WITH_ALTCHARS
6247 ? Fstring (len, args) : Fvector (len, args));
6249 else
6250 return;
6251 compose_text (pos, to, components, Qnil, coding->dst_object);
6255 /* Put `charset' property on text in CODING->object according to
6256 the annotation data at CHARBUF. CHARBUF is an array:
6257 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6260 static INLINE void
6261 produce_charset (coding, charbuf, pos)
6262 struct coding_system *coding;
6263 int *charbuf;
6264 EMACS_INT pos;
6266 EMACS_INT from = pos - charbuf[2];
6267 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6269 Fput_text_property (make_number (from), make_number (pos),
6270 Qcharset, CHARSET_NAME (charset),
6271 coding->dst_object);
6275 #define CHARBUF_SIZE 0x4000
6277 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6278 do { \
6279 int size = CHARBUF_SIZE;; \
6281 coding->charbuf = NULL; \
6282 while (size > 1024) \
6284 coding->charbuf = (int *) alloca (sizeof (int) * size); \
6285 if (coding->charbuf) \
6286 break; \
6287 size >>= 1; \
6289 if (! coding->charbuf) \
6291 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6292 return coding->result; \
6294 coding->charbuf_size = size; \
6295 } while (0)
6298 static void
6299 produce_annotation (coding, pos)
6300 struct coding_system *coding;
6301 EMACS_INT pos;
6303 int *charbuf = coding->charbuf;
6304 int *charbuf_end = charbuf + coding->charbuf_used;
6306 if (NILP (coding->dst_object))
6307 return;
6309 while (charbuf < charbuf_end)
6311 if (*charbuf >= 0)
6312 pos += *charbuf++;
6313 else
6315 int len = -*charbuf;
6316 switch (charbuf[1])
6318 case CODING_ANNOTATE_COMPOSITION_MASK:
6319 produce_composition (coding, charbuf, pos);
6320 break;
6321 case CODING_ANNOTATE_CHARSET_MASK:
6322 produce_charset (coding, charbuf, pos);
6323 break;
6324 default:
6325 abort ();
6327 charbuf += len;
6332 /* Decode the data at CODING->src_object into CODING->dst_object.
6333 CODING->src_object is a buffer, a string, or nil.
6334 CODING->dst_object is a buffer.
6336 If CODING->src_object is a buffer, it must be the current buffer.
6337 In this case, if CODING->src_pos is positive, it is a position of
6338 the source text in the buffer, otherwise, the source text is in the
6339 gap area of the buffer, and CODING->src_pos specifies the offset of
6340 the text from GPT (which must be the same as PT). If this is the
6341 same buffer as CODING->dst_object, CODING->src_pos must be
6342 negative.
6344 If CODING->src_object is a string, CODING->src_pos is an index to
6345 that string.
6347 If CODING->src_object is nil, CODING->source must already point to
6348 the non-relocatable memory area. In this case, CODING->src_pos is
6349 an offset from CODING->source.
6351 The decoded data is inserted at the current point of the buffer
6352 CODING->dst_object.
6355 static int
6356 decode_coding (coding)
6357 struct coding_system *coding;
6359 Lisp_Object attrs;
6360 Lisp_Object undo_list;
6361 Lisp_Object translation_table;
6362 int carryover;
6363 int i;
6365 if (BUFFERP (coding->src_object)
6366 && coding->src_pos > 0
6367 && coding->src_pos < GPT
6368 && coding->src_pos + coding->src_chars > GPT)
6369 move_gap_both (coding->src_pos, coding->src_pos_byte);
6371 undo_list = Qt;
6372 if (BUFFERP (coding->dst_object))
6374 if (current_buffer != XBUFFER (coding->dst_object))
6375 set_buffer_internal (XBUFFER (coding->dst_object));
6376 if (GPT != PT)
6377 move_gap_both (PT, PT_BYTE);
6378 undo_list = current_buffer->undo_list;
6379 current_buffer->undo_list = Qt;
6382 coding->consumed = coding->consumed_char = 0;
6383 coding->produced = coding->produced_char = 0;
6384 coding->chars_at_source = 0;
6385 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6386 coding->errors = 0;
6388 ALLOC_CONVERSION_WORK_AREA (coding);
6390 attrs = CODING_ID_ATTRS (coding->id);
6391 translation_table = get_translation_table (attrs, 0, NULL);
6393 carryover = 0;
6396 EMACS_INT pos = coding->dst_pos + coding->produced_char;
6398 coding_set_source (coding);
6399 coding->annotated = 0;
6400 coding->charbuf_used = carryover;
6401 (*(coding->decoder)) (coding);
6402 coding_set_destination (coding);
6403 carryover = produce_chars (coding, translation_table, 0);
6404 if (coding->annotated)
6405 produce_annotation (coding, pos);
6406 for (i = 0; i < carryover; i++)
6407 coding->charbuf[i]
6408 = coding->charbuf[coding->charbuf_used - carryover + i];
6410 while (coding->consumed < coding->src_bytes
6411 && (coding->result == CODING_RESULT_SUCCESS
6412 || coding->result == CODING_RESULT_INVALID_SRC));
6414 if (carryover > 0)
6416 coding_set_destination (coding);
6417 coding->charbuf_used = carryover;
6418 produce_chars (coding, translation_table, 1);
6421 coding->carryover_bytes = 0;
6422 if (coding->consumed < coding->src_bytes)
6424 int nbytes = coding->src_bytes - coding->consumed;
6425 const unsigned char *src;
6427 coding_set_source (coding);
6428 coding_set_destination (coding);
6429 src = coding->source + coding->consumed;
6431 if (coding->mode & CODING_MODE_LAST_BLOCK)
6433 /* Flush out unprocessed data as binary chars. We are sure
6434 that the number of data is less than the size of
6435 coding->charbuf. */
6436 coding->charbuf_used = 0;
6437 while (nbytes-- > 0)
6439 int c = *src++;
6441 if (c & 0x80)
6442 c = BYTE8_TO_CHAR (c);
6443 coding->charbuf[coding->charbuf_used++] = c;
6445 produce_chars (coding, Qnil, 1);
6447 else
6449 /* Record unprocessed bytes in coding->carryover. We are
6450 sure that the number of data is less than the size of
6451 coding->carryover. */
6452 unsigned char *p = coding->carryover;
6454 coding->carryover_bytes = nbytes;
6455 while (nbytes-- > 0)
6456 *p++ = *src++;
6458 coding->consumed = coding->src_bytes;
6461 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6462 decode_eol (coding);
6463 if (BUFFERP (coding->dst_object))
6465 current_buffer->undo_list = undo_list;
6466 record_insert (coding->dst_pos, coding->produced_char);
6468 return coding->result;
6472 /* Extract an annotation datum from a composition starting at POS and
6473 ending before LIMIT of CODING->src_object (buffer or string), store
6474 the data in BUF, set *STOP to a starting position of the next
6475 composition (if any) or to LIMIT, and return the address of the
6476 next element of BUF.
6478 If such an annotation is not found, set *STOP to a starting
6479 position of a composition after POS (if any) or to LIMIT, and
6480 return BUF. */
6482 static INLINE int *
6483 handle_composition_annotation (pos, limit, coding, buf, stop)
6484 EMACS_INT pos, limit;
6485 struct coding_system *coding;
6486 int *buf;
6487 EMACS_INT *stop;
6489 EMACS_INT start, end;
6490 Lisp_Object prop;
6492 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6493 || end > limit)
6494 *stop = limit;
6495 else if (start > pos)
6496 *stop = start;
6497 else
6499 if (start == pos)
6501 /* We found a composition. Store the corresponding
6502 annotation data in BUF. */
6503 int *head = buf;
6504 enum composition_method method = COMPOSITION_METHOD (prop);
6505 int nchars = COMPOSITION_LENGTH (prop);
6507 ADD_COMPOSITION_DATA (buf, nchars, method);
6508 if (method != COMPOSITION_RELATIVE)
6510 Lisp_Object components;
6511 int len, i, i_byte;
6513 components = COMPOSITION_COMPONENTS (prop);
6514 if (VECTORP (components))
6516 len = XVECTOR (components)->size;
6517 for (i = 0; i < len; i++)
6518 *buf++ = XINT (AREF (components, i));
6520 else if (STRINGP (components))
6522 len = SCHARS (components);
6523 i = i_byte = 0;
6524 while (i < len)
6526 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6527 buf++;
6530 else if (INTEGERP (components))
6532 len = 1;
6533 *buf++ = XINT (components);
6535 else if (CONSP (components))
6537 for (len = 0; CONSP (components);
6538 len++, components = XCDR (components))
6539 *buf++ = XINT (XCAR (components));
6541 else
6542 abort ();
6543 *head -= len;
6547 if (find_composition (end, limit, &start, &end, &prop,
6548 coding->src_object)
6549 && end <= limit)
6550 *stop = start;
6551 else
6552 *stop = limit;
6554 return buf;
6558 /* Extract an annotation datum from a text property `charset' at POS of
6559 CODING->src_object (buffer of string), store the data in BUF, set
6560 *STOP to the position where the value of `charset' property changes
6561 (limiting by LIMIT), and return the address of the next element of
6562 BUF.
6564 If the property value is nil, set *STOP to the position where the
6565 property value is non-nil (limiting by LIMIT), and return BUF. */
6567 static INLINE int *
6568 handle_charset_annotation (pos, limit, coding, buf, stop)
6569 EMACS_INT pos, limit;
6570 struct coding_system *coding;
6571 int *buf;
6572 EMACS_INT *stop;
6574 Lisp_Object val, next;
6575 int id;
6577 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6578 if (! NILP (val) && CHARSETP (val))
6579 id = XINT (CHARSET_SYMBOL_ID (val));
6580 else
6581 id = -1;
6582 ADD_CHARSET_DATA (buf, 0, id);
6583 next = Fnext_single_property_change (make_number (pos), Qcharset,
6584 coding->src_object,
6585 make_number (limit));
6586 *stop = XINT (next);
6587 return buf;
6591 static void
6592 consume_chars (coding, translation_table, max_lookup)
6593 struct coding_system *coding;
6594 Lisp_Object translation_table;
6595 int max_lookup;
6597 int *buf = coding->charbuf;
6598 int *buf_end = coding->charbuf + coding->charbuf_size;
6599 const unsigned char *src = coding->source + coding->consumed;
6600 const unsigned char *src_end = coding->source + coding->src_bytes;
6601 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6602 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6603 int multibytep = coding->src_multibyte;
6604 Lisp_Object eol_type;
6605 int c;
6606 EMACS_INT stop, stop_composition, stop_charset;
6607 int *lookup_buf = NULL;
6609 if (! NILP (translation_table))
6610 lookup_buf = alloca (sizeof (int) * max_lookup);
6612 eol_type = CODING_ID_EOL_TYPE (coding->id);
6613 if (VECTORP (eol_type))
6614 eol_type = Qunix;
6616 /* Note: composition handling is not yet implemented. */
6617 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6619 if (NILP (coding->src_object))
6620 stop = stop_composition = stop_charset = end_pos;
6621 else
6623 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6624 stop = stop_composition = pos;
6625 else
6626 stop = stop_composition = end_pos;
6627 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6628 stop = stop_charset = pos;
6629 else
6630 stop_charset = end_pos;
6633 /* Compensate for CRLF and conversion. */
6634 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6635 while (buf < buf_end)
6637 Lisp_Object trans;
6639 if (pos == stop)
6641 if (pos == end_pos)
6642 break;
6643 if (pos == stop_composition)
6644 buf = handle_composition_annotation (pos, end_pos, coding,
6645 buf, &stop_composition);
6646 if (pos == stop_charset)
6647 buf = handle_charset_annotation (pos, end_pos, coding,
6648 buf, &stop_charset);
6649 stop = (stop_composition < stop_charset
6650 ? stop_composition : stop_charset);
6653 if (! multibytep)
6655 EMACS_INT bytes;
6657 if (coding->encoder == encode_coding_raw_text)
6658 c = *src++, pos++;
6659 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6660 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
6661 else
6662 c = BYTE8_TO_CHAR (*src), src++, pos++;
6664 else
6665 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
6666 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6667 c = '\n';
6668 if (! EQ (eol_type, Qunix))
6670 if (c == '\n')
6672 if (EQ (eol_type, Qdos))
6673 *buf++ = '\r';
6674 else
6675 c = '\r';
6679 trans = Qnil;
6680 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6681 if (NILP (trans))
6682 *buf++ = c;
6683 else
6685 int from_nchars = 1, to_nchars = 1;
6686 int *lookup_buf_end;
6687 const unsigned char *p = src;
6688 int i;
6690 lookup_buf[0] = c;
6691 for (i = 1; i < max_lookup && p < src_end; i++)
6692 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6693 lookup_buf_end = lookup_buf + i;
6694 trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6695 &from_nchars, &to_nchars);
6696 if (EQ (trans, Qt)
6697 || buf + to_nchars > buf_end)
6698 break;
6699 *buf++ = *lookup_buf;
6700 for (i = 1; i < to_nchars; i++)
6701 *buf++ = XINT (AREF (trans, i));
6702 for (i = 1; i < from_nchars; i++, pos++)
6703 src += MULTIBYTE_LENGTH_NO_CHECK (src);
6707 coding->consumed = src - coding->source;
6708 coding->consumed_char = pos - coding->src_pos;
6709 coding->charbuf_used = buf - coding->charbuf;
6710 coding->chars_at_source = 0;
6714 /* Encode the text at CODING->src_object into CODING->dst_object.
6715 CODING->src_object is a buffer or a string.
6716 CODING->dst_object is a buffer or nil.
6718 If CODING->src_object is a buffer, it must be the current buffer.
6719 In this case, if CODING->src_pos is positive, it is a position of
6720 the source text in the buffer, otherwise. the source text is in the
6721 gap area of the buffer, and coding->src_pos specifies the offset of
6722 the text from GPT (which must be the same as PT). If this is the
6723 same buffer as CODING->dst_object, CODING->src_pos must be
6724 negative and CODING should not have `pre-write-conversion'.
6726 If CODING->src_object is a string, CODING should not have
6727 `pre-write-conversion'.
6729 If CODING->dst_object is a buffer, the encoded data is inserted at
6730 the current point of that buffer.
6732 If CODING->dst_object is nil, the encoded data is placed at the
6733 memory area specified by CODING->destination. */
6735 static int
6736 encode_coding (coding)
6737 struct coding_system *coding;
6739 Lisp_Object attrs;
6740 Lisp_Object translation_table;
6741 int max_lookup;
6743 attrs = CODING_ID_ATTRS (coding->id);
6744 if (coding->encoder == encode_coding_raw_text)
6745 translation_table = Qnil, max_lookup = 0;
6746 else
6747 translation_table = get_translation_table (attrs, 1, &max_lookup);
6749 if (BUFFERP (coding->dst_object))
6751 set_buffer_internal (XBUFFER (coding->dst_object));
6752 coding->dst_multibyte
6753 = ! NILP (current_buffer->enable_multibyte_characters);
6756 coding->consumed = coding->consumed_char = 0;
6757 coding->produced = coding->produced_char = 0;
6758 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6759 coding->errors = 0;
6761 ALLOC_CONVERSION_WORK_AREA (coding);
6763 do {
6764 coding_set_source (coding);
6765 consume_chars (coding, translation_table, max_lookup);
6766 coding_set_destination (coding);
6767 (*(coding->encoder)) (coding);
6768 } while (coding->consumed_char < coding->src_chars);
6770 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
6771 insert_from_gap (coding->produced_char, coding->produced);
6773 return (coding->result);
6777 /* Name (or base name) of work buffer for code conversion. */
6778 static Lisp_Object Vcode_conversion_workbuf_name;
6780 /* A working buffer used by the top level conversion. Once it is
6781 created, it is never destroyed. It has the name
6782 Vcode_conversion_workbuf_name. The other working buffers are
6783 destroyed after the use is finished, and their names are modified
6784 versions of Vcode_conversion_workbuf_name. */
6785 static Lisp_Object Vcode_conversion_reused_workbuf;
6787 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6788 static int reused_workbuf_in_use;
6791 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6792 multibyteness of returning buffer. */
6794 static Lisp_Object
6795 make_conversion_work_buffer (multibyte)
6796 int multibyte;
6798 Lisp_Object name, workbuf;
6799 struct buffer *current;
6801 if (reused_workbuf_in_use++)
6803 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6804 workbuf = Fget_buffer_create (name);
6806 else
6808 name = Vcode_conversion_workbuf_name;
6809 workbuf = Fget_buffer_create (name);
6810 if (NILP (Vcode_conversion_reused_workbuf))
6811 Vcode_conversion_reused_workbuf = workbuf;
6813 current = current_buffer;
6814 set_buffer_internal (XBUFFER (workbuf));
6815 Ferase_buffer ();
6816 current_buffer->undo_list = Qt;
6817 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6818 set_buffer_internal (current);
6819 return workbuf;
6823 static Lisp_Object
6824 code_conversion_restore (arg)
6825 Lisp_Object arg;
6827 Lisp_Object current, workbuf;
6828 struct gcpro gcpro1;
6830 GCPRO1 (arg);
6831 current = XCAR (arg);
6832 workbuf = XCDR (arg);
6833 if (! NILP (workbuf))
6835 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6836 reused_workbuf_in_use = 0;
6837 else if (! NILP (Fbuffer_live_p (workbuf)))
6838 Fkill_buffer (workbuf);
6840 set_buffer_internal (XBUFFER (current));
6841 UNGCPRO;
6842 return Qnil;
6845 Lisp_Object
6846 code_conversion_save (with_work_buf, multibyte)
6847 int with_work_buf, multibyte;
6849 Lisp_Object workbuf = Qnil;
6851 if (with_work_buf)
6852 workbuf = make_conversion_work_buffer (multibyte);
6853 record_unwind_protect (code_conversion_restore,
6854 Fcons (Fcurrent_buffer (), workbuf));
6855 return workbuf;
6859 decode_coding_gap (coding, chars, bytes)
6860 struct coding_system *coding;
6861 EMACS_INT chars, bytes;
6863 int count = specpdl_ptr - specpdl;
6864 Lisp_Object attrs;
6866 code_conversion_save (0, 0);
6868 coding->src_object = Fcurrent_buffer ();
6869 coding->src_chars = chars;
6870 coding->src_bytes = bytes;
6871 coding->src_pos = -chars;
6872 coding->src_pos_byte = -bytes;
6873 coding->src_multibyte = chars < bytes;
6874 coding->dst_object = coding->src_object;
6875 coding->dst_pos = PT;
6876 coding->dst_pos_byte = PT_BYTE;
6877 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6879 if (CODING_REQUIRE_DETECTION (coding))
6880 detect_coding (coding);
6882 coding->mode |= CODING_MODE_LAST_BLOCK;
6883 current_buffer->text->inhibit_shrinking = 1;
6884 decode_coding (coding);
6885 current_buffer->text->inhibit_shrinking = 0;
6887 attrs = CODING_ID_ATTRS (coding->id);
6888 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6890 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6891 Lisp_Object val;
6893 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6894 val = call1 (CODING_ATTR_POST_READ (attrs),
6895 make_number (coding->produced_char));
6896 CHECK_NATNUM (val);
6897 coding->produced_char += Z - prev_Z;
6898 coding->produced += Z_BYTE - prev_Z_BYTE;
6901 unbind_to (count, Qnil);
6902 return coding->result;
6906 encode_coding_gap (coding, chars, bytes)
6907 struct coding_system *coding;
6908 EMACS_INT chars, bytes;
6910 int count = specpdl_ptr - specpdl;
6912 code_conversion_save (0, 0);
6914 coding->src_object = Fcurrent_buffer ();
6915 coding->src_chars = chars;
6916 coding->src_bytes = bytes;
6917 coding->src_pos = -chars;
6918 coding->src_pos_byte = -bytes;
6919 coding->src_multibyte = chars < bytes;
6920 coding->dst_object = coding->src_object;
6921 coding->dst_pos = PT;
6922 coding->dst_pos_byte = PT_BYTE;
6924 encode_coding (coding);
6926 unbind_to (count, Qnil);
6927 return coding->result;
6931 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6932 SRC_OBJECT into DST_OBJECT by coding context CODING.
6934 SRC_OBJECT is a buffer, a string, or Qnil.
6936 If it is a buffer, the text is at point of the buffer. FROM and TO
6937 are positions in the buffer.
6939 If it is a string, the text is at the beginning of the string.
6940 FROM and TO are indices to the string.
6942 If it is nil, the text is at coding->source. FROM and TO are
6943 indices to coding->source.
6945 DST_OBJECT is a buffer, Qt, or Qnil.
6947 If it is a buffer, the decoded text is inserted at point of the
6948 buffer. If the buffer is the same as SRC_OBJECT, the source text
6949 is deleted.
6951 If it is Qt, a string is made from the decoded text, and
6952 set in CODING->dst_object.
6954 If it is Qnil, the decoded text is stored at CODING->destination.
6955 The caller must allocate CODING->dst_bytes bytes at
6956 CODING->destination by xmalloc. If the decoded text is longer than
6957 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6960 void
6961 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6962 dst_object)
6963 struct coding_system *coding;
6964 Lisp_Object src_object;
6965 EMACS_INT from, from_byte, to, to_byte;
6966 Lisp_Object dst_object;
6968 int count = specpdl_ptr - specpdl;
6969 unsigned char *destination;
6970 EMACS_INT dst_bytes;
6971 EMACS_INT chars = to - from;
6972 EMACS_INT bytes = to_byte - from_byte;
6973 Lisp_Object attrs;
6974 int saved_pt = -1, saved_pt_byte;
6975 int need_marker_adjustment = 0;
6976 Lisp_Object old_deactivate_mark;
6978 old_deactivate_mark = Vdeactivate_mark;
6980 if (NILP (dst_object))
6982 destination = coding->destination;
6983 dst_bytes = coding->dst_bytes;
6986 coding->src_object = src_object;
6987 coding->src_chars = chars;
6988 coding->src_bytes = bytes;
6989 coding->src_multibyte = chars < bytes;
6991 if (STRINGP (src_object))
6993 coding->src_pos = from;
6994 coding->src_pos_byte = from_byte;
6996 else if (BUFFERP (src_object))
6998 set_buffer_internal (XBUFFER (src_object));
6999 if (from != GPT)
7000 move_gap_both (from, from_byte);
7001 if (EQ (src_object, dst_object))
7003 struct Lisp_Marker *tail;
7005 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7007 tail->need_adjustment
7008 = tail->charpos == (tail->insertion_type ? from : to);
7009 need_marker_adjustment |= tail->need_adjustment;
7011 saved_pt = PT, saved_pt_byte = PT_BYTE;
7012 TEMP_SET_PT_BOTH (from, from_byte);
7013 current_buffer->text->inhibit_shrinking = 1;
7014 del_range_both (from, from_byte, to, to_byte, 1);
7015 coding->src_pos = -chars;
7016 coding->src_pos_byte = -bytes;
7018 else
7020 coding->src_pos = from;
7021 coding->src_pos_byte = from_byte;
7025 if (CODING_REQUIRE_DETECTION (coding))
7026 detect_coding (coding);
7027 attrs = CODING_ID_ATTRS (coding->id);
7029 if (EQ (dst_object, Qt)
7030 || (! NILP (CODING_ATTR_POST_READ (attrs))
7031 && NILP (dst_object)))
7033 coding->dst_object = code_conversion_save (1, 1);
7034 coding->dst_pos = BEG;
7035 coding->dst_pos_byte = BEG_BYTE;
7036 coding->dst_multibyte = 1;
7038 else if (BUFFERP (dst_object))
7040 code_conversion_save (0, 0);
7041 coding->dst_object = dst_object;
7042 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7043 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7044 coding->dst_multibyte
7045 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7047 else
7049 code_conversion_save (0, 0);
7050 coding->dst_object = Qnil;
7051 coding->dst_multibyte = 1;
7054 decode_coding (coding);
7056 if (BUFFERP (coding->dst_object))
7057 set_buffer_internal (XBUFFER (coding->dst_object));
7059 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7061 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7062 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7063 Lisp_Object val;
7065 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7066 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7067 old_deactivate_mark);
7068 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7069 make_number (coding->produced_char));
7070 UNGCPRO;
7071 CHECK_NATNUM (val);
7072 coding->produced_char += Z - prev_Z;
7073 coding->produced += Z_BYTE - prev_Z_BYTE;
7076 if (EQ (dst_object, Qt))
7078 coding->dst_object = Fbuffer_string ();
7080 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7082 set_buffer_internal (XBUFFER (coding->dst_object));
7083 if (dst_bytes < coding->produced)
7085 destination = xrealloc (destination, coding->produced);
7086 if (! destination)
7088 record_conversion_result (coding,
7089 CODING_RESULT_INSUFFICIENT_DST);
7090 unbind_to (count, Qnil);
7091 return;
7093 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7094 move_gap_both (BEGV, BEGV_BYTE);
7095 bcopy (BEGV_ADDR, destination, coding->produced);
7096 coding->destination = destination;
7100 if (saved_pt >= 0)
7102 /* This is the case of:
7103 (BUFFERP (src_object) && EQ (src_object, dst_object))
7104 As we have moved PT while replacing the original buffer
7105 contents, we must recover it now. */
7106 set_buffer_internal (XBUFFER (src_object));
7107 current_buffer->text->inhibit_shrinking = 0;
7108 if (saved_pt < from)
7109 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7110 else if (saved_pt < from + chars)
7111 TEMP_SET_PT_BOTH (from, from_byte);
7112 else if (! NILP (current_buffer->enable_multibyte_characters))
7113 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7114 saved_pt_byte + (coding->produced - bytes));
7115 else
7116 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7117 saved_pt_byte + (coding->produced - bytes));
7119 if (need_marker_adjustment)
7121 struct Lisp_Marker *tail;
7123 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7124 if (tail->need_adjustment)
7126 tail->need_adjustment = 0;
7127 if (tail->insertion_type)
7129 tail->bytepos = from_byte;
7130 tail->charpos = from;
7132 else
7134 tail->bytepos = from_byte + coding->produced;
7135 tail->charpos
7136 = (NILP (current_buffer->enable_multibyte_characters)
7137 ? tail->bytepos : from + coding->produced_char);
7143 Vdeactivate_mark = old_deactivate_mark;
7144 unbind_to (count, coding->dst_object);
7148 void
7149 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7150 dst_object)
7151 struct coding_system *coding;
7152 Lisp_Object src_object;
7153 EMACS_INT from, from_byte, to, to_byte;
7154 Lisp_Object dst_object;
7156 int count = specpdl_ptr - specpdl;
7157 EMACS_INT chars = to - from;
7158 EMACS_INT bytes = to_byte - from_byte;
7159 Lisp_Object attrs;
7160 int saved_pt = -1, saved_pt_byte;
7161 int need_marker_adjustment = 0;
7162 int kill_src_buffer = 0;
7163 Lisp_Object old_deactivate_mark;
7165 old_deactivate_mark = Vdeactivate_mark;
7167 coding->src_object = src_object;
7168 coding->src_chars = chars;
7169 coding->src_bytes = bytes;
7170 coding->src_multibyte = chars < bytes;
7172 attrs = CODING_ID_ATTRS (coding->id);
7174 if (EQ (src_object, dst_object))
7176 struct Lisp_Marker *tail;
7178 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7180 tail->need_adjustment
7181 = tail->charpos == (tail->insertion_type ? from : to);
7182 need_marker_adjustment |= tail->need_adjustment;
7186 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7188 coding->src_object = code_conversion_save (1, coding->src_multibyte);
7189 set_buffer_internal (XBUFFER (coding->src_object));
7190 if (STRINGP (src_object))
7191 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7192 else if (BUFFERP (src_object))
7193 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7194 else
7195 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7197 if (EQ (src_object, dst_object))
7199 set_buffer_internal (XBUFFER (src_object));
7200 saved_pt = PT, saved_pt_byte = PT_BYTE;
7201 del_range_both (from, from_byte, to, to_byte, 1);
7202 set_buffer_internal (XBUFFER (coding->src_object));
7206 Lisp_Object args[3];
7207 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7209 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7210 old_deactivate_mark);
7211 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7212 args[1] = make_number (BEG);
7213 args[2] = make_number (Z);
7214 safe_call (3, args);
7215 UNGCPRO;
7217 if (XBUFFER (coding->src_object) != current_buffer)
7218 kill_src_buffer = 1;
7219 coding->src_object = Fcurrent_buffer ();
7220 if (BEG != GPT)
7221 move_gap_both (BEG, BEG_BYTE);
7222 coding->src_chars = Z - BEG;
7223 coding->src_bytes = Z_BYTE - BEG_BYTE;
7224 coding->src_pos = BEG;
7225 coding->src_pos_byte = BEG_BYTE;
7226 coding->src_multibyte = Z < Z_BYTE;
7228 else if (STRINGP (src_object))
7230 code_conversion_save (0, 0);
7231 coding->src_pos = from;
7232 coding->src_pos_byte = from_byte;
7234 else if (BUFFERP (src_object))
7236 code_conversion_save (0, 0);
7237 set_buffer_internal (XBUFFER (src_object));
7238 if (EQ (src_object, dst_object))
7240 saved_pt = PT, saved_pt_byte = PT_BYTE;
7241 coding->src_object = del_range_1 (from, to, 1, 1);
7242 coding->src_pos = 0;
7243 coding->src_pos_byte = 0;
7245 else
7247 if (from < GPT && to >= GPT)
7248 move_gap_both (from, from_byte);
7249 coding->src_pos = from;
7250 coding->src_pos_byte = from_byte;
7253 else
7254 code_conversion_save (0, 0);
7256 if (BUFFERP (dst_object))
7258 coding->dst_object = dst_object;
7259 if (EQ (src_object, dst_object))
7261 coding->dst_pos = from;
7262 coding->dst_pos_byte = from_byte;
7264 else
7266 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7267 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7269 coding->dst_multibyte
7270 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7272 else if (EQ (dst_object, Qt))
7274 coding->dst_object = Qnil;
7275 coding->dst_bytes = coding->src_chars;
7276 if (coding->dst_bytes == 0)
7277 coding->dst_bytes = 1;
7278 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7279 coding->dst_multibyte = 0;
7281 else
7283 coding->dst_object = Qnil;
7284 coding->dst_multibyte = 0;
7287 encode_coding (coding);
7289 if (EQ (dst_object, Qt))
7291 if (BUFFERP (coding->dst_object))
7292 coding->dst_object = Fbuffer_string ();
7293 else
7295 coding->dst_object
7296 = make_unibyte_string ((char *) coding->destination,
7297 coding->produced);
7298 xfree (coding->destination);
7302 if (saved_pt >= 0)
7304 /* This is the case of:
7305 (BUFFERP (src_object) && EQ (src_object, dst_object))
7306 As we have moved PT while replacing the original buffer
7307 contents, we must recover it now. */
7308 set_buffer_internal (XBUFFER (src_object));
7309 if (saved_pt < from)
7310 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7311 else if (saved_pt < from + chars)
7312 TEMP_SET_PT_BOTH (from, from_byte);
7313 else if (! NILP (current_buffer->enable_multibyte_characters))
7314 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7315 saved_pt_byte + (coding->produced - bytes));
7316 else
7317 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7318 saved_pt_byte + (coding->produced - bytes));
7320 if (need_marker_adjustment)
7322 struct Lisp_Marker *tail;
7324 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7325 if (tail->need_adjustment)
7327 tail->need_adjustment = 0;
7328 if (tail->insertion_type)
7330 tail->bytepos = from_byte;
7331 tail->charpos = from;
7333 else
7335 tail->bytepos = from_byte + coding->produced;
7336 tail->charpos
7337 = (NILP (current_buffer->enable_multibyte_characters)
7338 ? tail->bytepos : from + coding->produced_char);
7344 if (kill_src_buffer)
7345 Fkill_buffer (coding->src_object);
7347 Vdeactivate_mark = old_deactivate_mark;
7348 unbind_to (count, Qnil);
7352 Lisp_Object
7353 preferred_coding_system ()
7355 int id = coding_categories[coding_priorities[0]].id;
7357 return CODING_ID_NAME (id);
7361 #ifdef emacs
7362 /*** 8. Emacs Lisp library functions ***/
7364 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7365 doc: /* Return t if OBJECT is nil or a coding-system.
7366 See the documentation of `define-coding-system' for information
7367 about coding-system objects. */)
7368 (obj)
7369 Lisp_Object obj;
7371 if (NILP (obj)
7372 || CODING_SYSTEM_ID (obj) >= 0)
7373 return Qt;
7374 if (! SYMBOLP (obj)
7375 || NILP (Fget (obj, Qcoding_system_define_form)))
7376 return Qnil;
7377 return Qt;
7380 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7381 Sread_non_nil_coding_system, 1, 1, 0,
7382 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
7383 (prompt)
7384 Lisp_Object prompt;
7386 Lisp_Object val;
7389 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7390 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7392 while (SCHARS (val) == 0);
7393 return (Fintern (val, Qnil));
7396 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7397 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7398 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
7399 Ignores case when completing coding systems (all Emacs coding systems
7400 are lower-case). */)
7401 (prompt, default_coding_system)
7402 Lisp_Object prompt, default_coding_system;
7404 Lisp_Object val;
7405 int count = SPECPDL_INDEX ();
7407 if (SYMBOLP (default_coding_system))
7408 default_coding_system = SYMBOL_NAME (default_coding_system);
7409 specbind (Qcompletion_ignore_case, Qt);
7410 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7411 Qt, Qnil, Qcoding_system_history,
7412 default_coding_system, Qnil);
7413 unbind_to (count, Qnil);
7414 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7417 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7418 1, 1, 0,
7419 doc: /* Check validity of CODING-SYSTEM.
7420 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7421 It is valid if it is nil or a symbol defined as a coding system by the
7422 function `define-coding-system'. */)
7423 (coding_system)
7424 Lisp_Object coding_system;
7426 Lisp_Object define_form;
7428 define_form = Fget (coding_system, Qcoding_system_define_form);
7429 if (! NILP (define_form))
7431 Fput (coding_system, Qcoding_system_define_form, Qnil);
7432 safe_eval (define_form);
7434 if (!NILP (Fcoding_system_p (coding_system)))
7435 return coding_system;
7436 xsignal1 (Qcoding_system_error, coding_system);
7440 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
7441 HIGHEST is nonzero, return the coding system of the highest
7442 priority among the detected coding systems. Otherwize return a
7443 list of detected coding systems sorted by their priorities. If
7444 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7445 multibyte form but contains only ASCII and eight-bit chars.
7446 Otherwise, the bytes are raw bytes.
7448 CODING-SYSTEM controls the detection as below:
7450 If it is nil, detect both text-format and eol-format. If the
7451 text-format part of CODING-SYSTEM is already specified
7452 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
7453 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7454 detect only text-format. */
7456 Lisp_Object
7457 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7458 coding_system)
7459 const unsigned char *src;
7460 EMACS_INT src_chars, src_bytes;
7461 int highest;
7462 int multibytep;
7463 Lisp_Object coding_system;
7465 const unsigned char *src_end = src + src_bytes;
7466 Lisp_Object attrs, eol_type;
7467 Lisp_Object val;
7468 struct coding_system coding;
7469 int id;
7470 struct coding_detection_info detect_info;
7471 enum coding_category base_category;
7473 if (NILP (coding_system))
7474 coding_system = Qundecided;
7475 setup_coding_system (coding_system, &coding);
7476 attrs = CODING_ID_ATTRS (coding.id);
7477 eol_type = CODING_ID_EOL_TYPE (coding.id);
7478 coding_system = CODING_ATTR_BASE_NAME (attrs);
7480 coding.source = src;
7481 coding.src_chars = src_chars;
7482 coding.src_bytes = src_bytes;
7483 coding.src_multibyte = multibytep;
7484 coding.consumed = 0;
7485 coding.mode |= CODING_MODE_LAST_BLOCK;
7487 detect_info.checked = detect_info.found = detect_info.rejected = 0;
7489 /* At first, detect text-format if necessary. */
7490 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7491 if (base_category == coding_category_undecided)
7493 enum coding_category category;
7494 struct coding_system *this;
7495 int c, i;
7497 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7498 for (i = 0; src < src_end; i++, src++)
7500 c = *src;
7501 if (c & 0x80)
7502 break;
7503 if (c < 0x20
7504 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7505 && ! inhibit_iso_escape_detection)
7507 coding.head_ascii = src - coding.source;
7508 if (detect_coding_iso_2022 (&coding, &detect_info))
7510 /* We have scanned the whole data. */
7511 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7512 /* We didn't find an 8-bit code. */
7513 src = src_end;
7514 break;
7518 coding.head_ascii = src - coding.source;
7520 if (src < src_end
7521 || detect_info.found)
7523 if (src == src_end)
7524 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7525 for (i = 0; i < coding_category_raw_text; i++)
7527 category = coding_priorities[i];
7528 this = coding_categories + category;
7529 if (detect_info.found & (1 << category))
7530 break;
7532 else
7533 for (i = 0; i < coding_category_raw_text; i++)
7535 category = coding_priorities[i];
7536 this = coding_categories + category;
7538 if (this->id < 0)
7540 /* No coding system of this category is defined. */
7541 detect_info.rejected |= (1 << category);
7543 else if (category >= coding_category_raw_text)
7544 continue;
7545 else if (detect_info.checked & (1 << category))
7547 if (highest
7548 && (detect_info.found & (1 << category)))
7549 break;
7551 else
7553 if ((*(this->detector)) (&coding, &detect_info)
7554 && highest
7555 && (detect_info.found & (1 << category)))
7557 if (category == coding_category_utf_16_auto)
7559 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7560 category = coding_category_utf_16_le;
7561 else
7562 category = coding_category_utf_16_be;
7564 break;
7570 if (detect_info.rejected == CATEGORY_MASK_ANY)
7572 detect_info.found = CATEGORY_MASK_RAW_TEXT;
7573 id = coding_categories[coding_category_raw_text].id;
7574 val = Fcons (make_number (id), Qnil);
7576 else if (! detect_info.rejected && ! detect_info.found)
7578 detect_info.found = CATEGORY_MASK_ANY;
7579 id = coding_categories[coding_category_undecided].id;
7580 val = Fcons (make_number (id), Qnil);
7582 else if (highest)
7584 if (detect_info.found)
7586 detect_info.found = 1 << category;
7587 val = Fcons (make_number (this->id), Qnil);
7589 else
7590 for (i = 0; i < coding_category_raw_text; i++)
7591 if (! (detect_info.rejected & (1 << coding_priorities[i])))
7593 detect_info.found = 1 << coding_priorities[i];
7594 id = coding_categories[coding_priorities[i]].id;
7595 val = Fcons (make_number (id), Qnil);
7596 break;
7599 else
7601 int mask = detect_info.rejected | detect_info.found;
7602 int found = 0;
7603 val = Qnil;
7605 for (i = coding_category_raw_text - 1; i >= 0; i--)
7607 category = coding_priorities[i];
7608 if (! (mask & (1 << category)))
7610 found |= 1 << category;
7611 id = coding_categories[category].id;
7612 if (id >= 0)
7613 val = Fcons (make_number (id), val);
7616 for (i = coding_category_raw_text - 1; i >= 0; i--)
7618 category = coding_priorities[i];
7619 if (detect_info.found & (1 << category))
7621 id = coding_categories[category].id;
7622 val = Fcons (make_number (id), val);
7625 detect_info.found |= found;
7628 else if (base_category == coding_category_utf_16_auto)
7630 if (detect_coding_utf_16 (&coding, &detect_info))
7632 struct coding_system *this;
7634 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7635 this = coding_categories + coding_category_utf_16_le;
7636 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7637 this = coding_categories + coding_category_utf_16_be;
7638 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7639 this = coding_categories + coding_category_utf_16_be_nosig;
7640 else
7641 this = coding_categories + coding_category_utf_16_le_nosig;
7642 val = Fcons (make_number (this->id), Qnil);
7645 else
7647 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7648 val = Fcons (make_number (coding.id), Qnil);
7651 /* Then, detect eol-format if necessary. */
7653 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7654 Lisp_Object tail;
7656 if (VECTORP (eol_type))
7658 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7659 normal_eol = detect_eol (coding.source, src_bytes,
7660 coding_category_raw_text);
7661 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7662 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7663 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7664 coding_category_utf_16_be);
7665 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7666 | CATEGORY_MASK_UTF_16_LE_NOSIG))
7667 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7668 coding_category_utf_16_le);
7670 else
7672 if (EQ (eol_type, Qunix))
7673 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7674 else if (EQ (eol_type, Qdos))
7675 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7676 else
7677 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7680 for (tail = val; CONSP (tail); tail = XCDR (tail))
7682 enum coding_category category;
7683 int this_eol;
7685 id = XINT (XCAR (tail));
7686 attrs = CODING_ID_ATTRS (id);
7687 category = XINT (CODING_ATTR_CATEGORY (attrs));
7688 eol_type = CODING_ID_EOL_TYPE (id);
7689 if (VECTORP (eol_type))
7691 if (category == coding_category_utf_16_be
7692 || category == coding_category_utf_16_be_nosig)
7693 this_eol = utf_16_be_eol;
7694 else if (category == coding_category_utf_16_le
7695 || category == coding_category_utf_16_le_nosig)
7696 this_eol = utf_16_le_eol;
7697 else
7698 this_eol = normal_eol;
7700 if (this_eol == EOL_SEEN_LF)
7701 XSETCAR (tail, AREF (eol_type, 0));
7702 else if (this_eol == EOL_SEEN_CRLF)
7703 XSETCAR (tail, AREF (eol_type, 1));
7704 else if (this_eol == EOL_SEEN_CR)
7705 XSETCAR (tail, AREF (eol_type, 2));
7706 else
7707 XSETCAR (tail, CODING_ID_NAME (id));
7709 else
7710 XSETCAR (tail, CODING_ID_NAME (id));
7714 return (highest ? XCAR (val) : val);
7718 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7719 2, 3, 0,
7720 doc: /* Detect coding system of the text in the region between START and END.
7721 Return a list of possible coding systems ordered by priority.
7723 If only ASCII characters are found (except for such ISO-2022 control
7724 characters ISO-2022 as ESC), it returns a list of single element
7725 `undecided' or its subsidiary coding system according to a detected
7726 end-of-line format.
7728 If optional argument HIGHEST is non-nil, return the coding system of
7729 highest priority. */)
7730 (start, end, highest)
7731 Lisp_Object start, end, highest;
7733 int from, to;
7734 int from_byte, to_byte;
7736 CHECK_NUMBER_COERCE_MARKER (start);
7737 CHECK_NUMBER_COERCE_MARKER (end);
7739 validate_region (&start, &end);
7740 from = XINT (start), to = XINT (end);
7741 from_byte = CHAR_TO_BYTE (from);
7742 to_byte = CHAR_TO_BYTE (to);
7744 if (from < GPT && to >= GPT)
7745 move_gap_both (to, to_byte);
7747 return detect_coding_system (BYTE_POS_ADDR (from_byte),
7748 to - from, to_byte - from_byte,
7749 !NILP (highest),
7750 !NILP (current_buffer
7751 ->enable_multibyte_characters),
7752 Qnil);
7755 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7756 1, 2, 0,
7757 doc: /* Detect coding system of the text in STRING.
7758 Return a list of possible coding systems ordered by priority.
7760 If only ASCII characters are found (except for such ISO-2022 control
7761 characters ISO-2022 as ESC), it returns a list of single element
7762 `undecided' or its subsidiary coding system according to a detected
7763 end-of-line format.
7765 If optional argument HIGHEST is non-nil, return the coding system of
7766 highest priority. */)
7767 (string, highest)
7768 Lisp_Object string, highest;
7770 CHECK_STRING (string);
7772 return detect_coding_system (SDATA (string),
7773 SCHARS (string), SBYTES (string),
7774 !NILP (highest), STRING_MULTIBYTE (string),
7775 Qnil);
7779 static INLINE int
7780 char_encodable_p (c, attrs)
7781 int c;
7782 Lisp_Object attrs;
7784 Lisp_Object tail;
7785 struct charset *charset;
7786 Lisp_Object translation_table;
7788 translation_table = CODING_ATTR_TRANS_TBL (attrs);
7789 if (! NILP (translation_table))
7790 c = translate_char (translation_table, c);
7791 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7792 CONSP (tail); tail = XCDR (tail))
7794 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7795 if (CHAR_CHARSET_P (c, charset))
7796 break;
7798 return (! NILP (tail));
7802 /* Return a list of coding systems that safely encode the text between
7803 START and END. If EXCLUDE is non-nil, it is a list of coding
7804 systems not to check. The returned list doesn't contain any such
7805 coding systems. In any case, if the text contains only ASCII or is
7806 unibyte, return t. */
7808 DEFUN ("find-coding-systems-region-internal",
7809 Ffind_coding_systems_region_internal,
7810 Sfind_coding_systems_region_internal, 2, 3, 0,
7811 doc: /* Internal use only. */)
7812 (start, end, exclude)
7813 Lisp_Object start, end, exclude;
7815 Lisp_Object coding_attrs_list, safe_codings;
7816 EMACS_INT start_byte, end_byte;
7817 const unsigned char *p, *pbeg, *pend;
7818 int c;
7819 Lisp_Object tail, elt;
7821 if (STRINGP (start))
7823 if (!STRING_MULTIBYTE (start)
7824 || SCHARS (start) == SBYTES (start))
7825 return Qt;
7826 start_byte = 0;
7827 end_byte = SBYTES (start);
7829 else
7831 CHECK_NUMBER_COERCE_MARKER (start);
7832 CHECK_NUMBER_COERCE_MARKER (end);
7833 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7834 args_out_of_range (start, end);
7835 if (NILP (current_buffer->enable_multibyte_characters))
7836 return Qt;
7837 start_byte = CHAR_TO_BYTE (XINT (start));
7838 end_byte = CHAR_TO_BYTE (XINT (end));
7839 if (XINT (end) - XINT (start) == end_byte - start_byte)
7840 return Qt;
7842 if (XINT (start) < GPT && XINT (end) > GPT)
7844 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7845 move_gap_both (XINT (start), start_byte);
7846 else
7847 move_gap_both (XINT (end), end_byte);
7851 coding_attrs_list = Qnil;
7852 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7853 if (NILP (exclude)
7854 || NILP (Fmemq (XCAR (tail), exclude)))
7856 Lisp_Object attrs;
7858 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7859 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7860 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7862 ASET (attrs, coding_attr_trans_tbl,
7863 get_translation_table (attrs, 1, NULL));
7864 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7868 if (STRINGP (start))
7869 p = pbeg = SDATA (start);
7870 else
7871 p = pbeg = BYTE_POS_ADDR (start_byte);
7872 pend = p + (end_byte - start_byte);
7874 while (p < pend && ASCII_BYTE_P (*p)) p++;
7875 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7877 while (p < pend)
7879 if (ASCII_BYTE_P (*p))
7880 p++;
7881 else
7883 c = STRING_CHAR_ADVANCE (p);
7885 charset_map_loaded = 0;
7886 for (tail = coding_attrs_list; CONSP (tail);)
7888 elt = XCAR (tail);
7889 if (NILP (elt))
7890 tail = XCDR (tail);
7891 else if (char_encodable_p (c, elt))
7892 tail = XCDR (tail);
7893 else if (CONSP (XCDR (tail)))
7895 XSETCAR (tail, XCAR (XCDR (tail)));
7896 XSETCDR (tail, XCDR (XCDR (tail)));
7898 else
7900 XSETCAR (tail, Qnil);
7901 tail = XCDR (tail);
7904 if (charset_map_loaded)
7906 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7908 if (STRINGP (start))
7909 pbeg = SDATA (start);
7910 else
7911 pbeg = BYTE_POS_ADDR (start_byte);
7912 p = pbeg + p_offset;
7913 pend = pbeg + pend_offset;
7918 safe_codings = list2 (Qraw_text, Qno_conversion);
7919 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7920 if (! NILP (XCAR (tail)))
7921 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7923 return safe_codings;
7927 DEFUN ("unencodable-char-position", Funencodable_char_position,
7928 Sunencodable_char_position, 3, 5, 0,
7929 doc: /*
7930 Return position of first un-encodable character in a region.
7931 START and END specfiy the region and CODING-SYSTEM specifies the
7932 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7934 If optional 4th argument COUNT is non-nil, it specifies at most how
7935 many un-encodable characters to search. In this case, the value is a
7936 list of positions.
7938 If optional 5th argument STRING is non-nil, it is a string to search
7939 for un-encodable characters. In that case, START and END are indexes
7940 to the string. */)
7941 (start, end, coding_system, count, string)
7942 Lisp_Object start, end, coding_system, count, string;
7944 int n;
7945 struct coding_system coding;
7946 Lisp_Object attrs, charset_list, translation_table;
7947 Lisp_Object positions;
7948 int from, to;
7949 const unsigned char *p, *stop, *pend;
7950 int ascii_compatible;
7952 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7953 attrs = CODING_ID_ATTRS (coding.id);
7954 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7955 return Qnil;
7956 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7957 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7958 translation_table = get_translation_table (attrs, 1, NULL);
7960 if (NILP (string))
7962 validate_region (&start, &end);
7963 from = XINT (start);
7964 to = XINT (end);
7965 if (NILP (current_buffer->enable_multibyte_characters)
7966 || (ascii_compatible
7967 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7968 return Qnil;
7969 p = CHAR_POS_ADDR (from);
7970 pend = CHAR_POS_ADDR (to);
7971 if (from < GPT && to >= GPT)
7972 stop = GPT_ADDR;
7973 else
7974 stop = pend;
7976 else
7978 CHECK_STRING (string);
7979 CHECK_NATNUM (start);
7980 CHECK_NATNUM (end);
7981 from = XINT (start);
7982 to = XINT (end);
7983 if (from > to
7984 || to > SCHARS (string))
7985 args_out_of_range_3 (string, start, end);
7986 if (! STRING_MULTIBYTE (string))
7987 return Qnil;
7988 p = SDATA (string) + string_char_to_byte (string, from);
7989 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7990 if (ascii_compatible && (to - from) == (pend - p))
7991 return Qnil;
7994 if (NILP (count))
7995 n = 1;
7996 else
7998 CHECK_NATNUM (count);
7999 n = XINT (count);
8002 positions = Qnil;
8003 while (1)
8005 int c;
8007 if (ascii_compatible)
8008 while (p < stop && ASCII_BYTE_P (*p))
8009 p++, from++;
8010 if (p >= stop)
8012 if (p >= pend)
8013 break;
8014 stop = pend;
8015 p = GAP_END_ADDR;
8018 c = STRING_CHAR_ADVANCE (p);
8019 if (! (ASCII_CHAR_P (c) && ascii_compatible)
8020 && ! char_charset (translate_char (translation_table, c),
8021 charset_list, NULL))
8023 positions = Fcons (make_number (from), positions);
8024 n--;
8025 if (n == 0)
8026 break;
8029 from++;
8032 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8036 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8037 Scheck_coding_systems_region, 3, 3, 0,
8038 doc: /* Check if the region is encodable by coding systems.
8040 START and END are buffer positions specifying the region.
8041 CODING-SYSTEM-LIST is a list of coding systems to check.
8043 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8044 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
8045 whole region, POS0, POS1, ... are buffer positions where non-encodable
8046 characters are found.
8048 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8049 value is nil.
8051 START may be a string. In that case, check if the string is
8052 encodable, and the value contains indices to the string instead of
8053 buffer positions. END is ignored. */)
8054 (start, end, coding_system_list)
8055 Lisp_Object start, end, coding_system_list;
8057 Lisp_Object list;
8058 EMACS_INT start_byte, end_byte;
8059 int pos;
8060 const unsigned char *p, *pbeg, *pend;
8061 int c;
8062 Lisp_Object tail, elt, attrs;
8064 if (STRINGP (start))
8066 if (!STRING_MULTIBYTE (start)
8067 && SCHARS (start) != SBYTES (start))
8068 return Qnil;
8069 start_byte = 0;
8070 end_byte = SBYTES (start);
8071 pos = 0;
8073 else
8075 CHECK_NUMBER_COERCE_MARKER (start);
8076 CHECK_NUMBER_COERCE_MARKER (end);
8077 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8078 args_out_of_range (start, end);
8079 if (NILP (current_buffer->enable_multibyte_characters))
8080 return Qnil;
8081 start_byte = CHAR_TO_BYTE (XINT (start));
8082 end_byte = CHAR_TO_BYTE (XINT (end));
8083 if (XINT (end) - XINT (start) == end_byte - start_byte)
8084 return Qt;
8086 if (XINT (start) < GPT && XINT (end) > GPT)
8088 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8089 move_gap_both (XINT (start), start_byte);
8090 else
8091 move_gap_both (XINT (end), end_byte);
8093 pos = XINT (start);
8096 list = Qnil;
8097 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8099 elt = XCAR (tail);
8100 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8101 ASET (attrs, coding_attr_trans_tbl,
8102 get_translation_table (attrs, 1, NULL));
8103 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8106 if (STRINGP (start))
8107 p = pbeg = SDATA (start);
8108 else
8109 p = pbeg = BYTE_POS_ADDR (start_byte);
8110 pend = p + (end_byte - start_byte);
8112 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8113 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8115 while (p < pend)
8117 if (ASCII_BYTE_P (*p))
8118 p++;
8119 else
8121 c = STRING_CHAR_ADVANCE (p);
8123 charset_map_loaded = 0;
8124 for (tail = list; CONSP (tail); tail = XCDR (tail))
8126 elt = XCDR (XCAR (tail));
8127 if (! char_encodable_p (c, XCAR (elt)))
8128 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8130 if (charset_map_loaded)
8132 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8134 if (STRINGP (start))
8135 pbeg = SDATA (start);
8136 else
8137 pbeg = BYTE_POS_ADDR (start_byte);
8138 p = pbeg + p_offset;
8139 pend = pbeg + pend_offset;
8142 pos++;
8145 tail = list;
8146 list = Qnil;
8147 for (; CONSP (tail); tail = XCDR (tail))
8149 elt = XCAR (tail);
8150 if (CONSP (XCDR (XCDR (elt))))
8151 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8152 list);
8155 return list;
8159 Lisp_Object
8160 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8161 Lisp_Object start, end, coding_system, dst_object;
8162 int encodep, norecord;
8164 struct coding_system coding;
8165 EMACS_INT from, from_byte, to, to_byte;
8166 Lisp_Object src_object;
8168 CHECK_NUMBER_COERCE_MARKER (start);
8169 CHECK_NUMBER_COERCE_MARKER (end);
8170 if (NILP (coding_system))
8171 coding_system = Qno_conversion;
8172 else
8173 CHECK_CODING_SYSTEM (coding_system);
8174 src_object = Fcurrent_buffer ();
8175 if (NILP (dst_object))
8176 dst_object = src_object;
8177 else if (! EQ (dst_object, Qt))
8178 CHECK_BUFFER (dst_object);
8180 validate_region (&start, &end);
8181 from = XFASTINT (start);
8182 from_byte = CHAR_TO_BYTE (from);
8183 to = XFASTINT (end);
8184 to_byte = CHAR_TO_BYTE (to);
8186 setup_coding_system (coding_system, &coding);
8187 coding.mode |= CODING_MODE_LAST_BLOCK;
8189 if (encodep)
8190 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8191 dst_object);
8192 else
8193 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8194 dst_object);
8195 if (! norecord)
8196 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8198 return (BUFFERP (dst_object)
8199 ? make_number (coding.produced_char)
8200 : coding.dst_object);
8204 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8205 3, 4, "r\nzCoding system: ",
8206 doc: /* Decode the current region from the specified coding system.
8207 When called from a program, takes four arguments:
8208 START, END, CODING-SYSTEM, and DESTINATION.
8209 START and END are buffer positions.
8211 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8212 If nil, the region between START and END is replaced by the decoded text.
8213 If buffer, the decoded text is inserted in the buffer.
8214 If t, the decoded text is returned.
8216 This function sets `last-coding-system-used' to the precise coding system
8217 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8218 not fully specified.)
8219 It returns the length of the decoded text. */)
8220 (start, end, coding_system, destination)
8221 Lisp_Object start, end, coding_system, destination;
8223 return code_convert_region (start, end, coding_system, destination, 0, 0);
8226 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8227 3, 4, "r\nzCoding system: ",
8228 doc: /* Encode the current region by specified coding system.
8229 When called from a program, takes three arguments:
8230 START, END, and CODING-SYSTEM. START and END are buffer positions.
8232 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8233 If nil, the region between START and END is replace by the encoded text.
8234 If buffer, the encoded text is inserted in the buffer.
8235 If t, the encoded text is returned.
8237 This function sets `last-coding-system-used' to the precise coding system
8238 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8239 not fully specified.)
8240 It returns the length of the encoded text. */)
8241 (start, end, coding_system, destination)
8242 Lisp_Object start, end, coding_system, destination;
8244 return code_convert_region (start, end, coding_system, destination, 1, 0);
8247 Lisp_Object
8248 code_convert_string (string, coding_system, dst_object,
8249 encodep, nocopy, norecord)
8250 Lisp_Object string, coding_system, dst_object;
8251 int encodep, nocopy, norecord;
8253 struct coding_system coding;
8254 EMACS_INT chars, bytes;
8256 CHECK_STRING (string);
8257 if (NILP (coding_system))
8259 if (! norecord)
8260 Vlast_coding_system_used = Qno_conversion;
8261 if (NILP (dst_object))
8262 return (nocopy ? Fcopy_sequence (string) : string);
8265 if (NILP (coding_system))
8266 coding_system = Qno_conversion;
8267 else
8268 CHECK_CODING_SYSTEM (coding_system);
8269 if (NILP (dst_object))
8270 dst_object = Qt;
8271 else if (! EQ (dst_object, Qt))
8272 CHECK_BUFFER (dst_object);
8274 setup_coding_system (coding_system, &coding);
8275 coding.mode |= CODING_MODE_LAST_BLOCK;
8276 chars = SCHARS (string);
8277 bytes = SBYTES (string);
8278 if (encodep)
8279 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8280 else
8281 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8282 if (! norecord)
8283 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8285 return (BUFFERP (dst_object)
8286 ? make_number (coding.produced_char)
8287 : coding.dst_object);
8291 /* Encode or decode STRING according to CODING_SYSTEM.
8292 Do not set Vlast_coding_system_used.
8294 This function is called only from macros DECODE_FILE and
8295 ENCODE_FILE, thus we ignore character composition. */
8297 Lisp_Object
8298 code_convert_string_norecord (string, coding_system, encodep)
8299 Lisp_Object string, coding_system;
8300 int encodep;
8302 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8306 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8307 2, 4, 0,
8308 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8310 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8311 if the decoding operation is trivial.
8313 Optional fourth arg BUFFER non-nil meant that the decoded text is
8314 inserted in BUFFER instead of returned as a string. In this case,
8315 the return value is BUFFER.
8317 This function sets `last-coding-system-used' to the precise coding system
8318 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8319 not fully specified. */)
8320 (string, coding_system, nocopy, buffer)
8321 Lisp_Object string, coding_system, nocopy, buffer;
8323 return code_convert_string (string, coding_system, buffer,
8324 0, ! NILP (nocopy), 0);
8327 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8328 2, 4, 0,
8329 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8331 Optional third arg NOCOPY non-nil means it is OK to return STRING
8332 itself if the encoding operation is trivial.
8334 Optional fourth arg BUFFER non-nil meant that the encoded text is
8335 inserted in BUFFER instead of returned as a string. In this case,
8336 the return value is BUFFER.
8338 This function sets `last-coding-system-used' to the precise coding system
8339 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8340 not fully specified.) */)
8341 (string, coding_system, nocopy, buffer)
8342 Lisp_Object string, coding_system, nocopy, buffer;
8344 return code_convert_string (string, coding_system, buffer,
8345 1, ! NILP (nocopy), 1);
8349 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8350 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8351 Return the corresponding character. */)
8352 (code)
8353 Lisp_Object code;
8355 Lisp_Object spec, attrs, val;
8356 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8357 int c;
8359 CHECK_NATNUM (code);
8360 c = XFASTINT (code);
8361 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8362 attrs = AREF (spec, 0);
8364 if (ASCII_BYTE_P (c)
8365 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8366 return code;
8368 val = CODING_ATTR_CHARSET_LIST (attrs);
8369 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8370 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8371 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8373 if (c <= 0x7F)
8374 charset = charset_roman;
8375 else if (c >= 0xA0 && c < 0xDF)
8377 charset = charset_kana;
8378 c -= 0x80;
8380 else
8382 int s1 = c >> 8, s2 = c & 0xFF;
8384 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8385 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8386 error ("Invalid code: %d", code);
8387 SJIS_TO_JIS (c);
8388 charset = charset_kanji;
8390 c = DECODE_CHAR (charset, c);
8391 if (c < 0)
8392 error ("Invalid code: %d", code);
8393 return make_number (c);
8397 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8398 doc: /* Encode a Japanese character CH to shift_jis encoding.
8399 Return the corresponding code in SJIS. */)
8400 (ch)
8401 Lisp_Object ch;
8403 Lisp_Object spec, attrs, charset_list;
8404 int c;
8405 struct charset *charset;
8406 unsigned code;
8408 CHECK_CHARACTER (ch);
8409 c = XFASTINT (ch);
8410 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8411 attrs = AREF (spec, 0);
8413 if (ASCII_CHAR_P (c)
8414 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8415 return ch;
8417 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8418 charset = char_charset (c, charset_list, &code);
8419 if (code == CHARSET_INVALID_CODE (charset))
8420 error ("Can't encode by shift_jis encoding: %d", c);
8421 JIS_TO_SJIS (code);
8423 return make_number (code);
8426 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8427 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8428 Return the corresponding character. */)
8429 (code)
8430 Lisp_Object code;
8432 Lisp_Object spec, attrs, val;
8433 struct charset *charset_roman, *charset_big5, *charset;
8434 int c;
8436 CHECK_NATNUM (code);
8437 c = XFASTINT (code);
8438 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8439 attrs = AREF (spec, 0);
8441 if (ASCII_BYTE_P (c)
8442 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8443 return code;
8445 val = CODING_ATTR_CHARSET_LIST (attrs);
8446 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8447 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8449 if (c <= 0x7F)
8450 charset = charset_roman;
8451 else
8453 int b1 = c >> 8, b2 = c & 0x7F;
8454 if (b1 < 0xA1 || b1 > 0xFE
8455 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8456 error ("Invalid code: %d", code);
8457 charset = charset_big5;
8459 c = DECODE_CHAR (charset, (unsigned )c);
8460 if (c < 0)
8461 error ("Invalid code: %d", code);
8462 return make_number (c);
8465 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8466 doc: /* Encode the Big5 character CH to BIG5 coding system.
8467 Return the corresponding character code in Big5. */)
8468 (ch)
8469 Lisp_Object ch;
8471 Lisp_Object spec, attrs, charset_list;
8472 struct charset *charset;
8473 int c;
8474 unsigned code;
8476 CHECK_CHARACTER (ch);
8477 c = XFASTINT (ch);
8478 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8479 attrs = AREF (spec, 0);
8480 if (ASCII_CHAR_P (c)
8481 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8482 return ch;
8484 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8485 charset = char_charset (c, charset_list, &code);
8486 if (code == CHARSET_INVALID_CODE (charset))
8487 error ("Can't encode by Big5 encoding: %d", c);
8489 return make_number (code);
8493 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
8494 Sset_terminal_coding_system_internal, 1, 2, 0,
8495 doc: /* Internal use only. */)
8496 (coding_system, terminal)
8497 Lisp_Object coding_system;
8498 Lisp_Object terminal;
8500 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8501 CHECK_SYMBOL (coding_system);
8502 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
8503 /* We had better not send unsafe characters to terminal. */
8504 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
8505 /* Characer composition should be disabled. */
8506 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8507 terminal_coding->src_multibyte = 1;
8508 terminal_coding->dst_multibyte = 0;
8509 return Qnil;
8512 DEFUN ("set-safe-terminal-coding-system-internal",
8513 Fset_safe_terminal_coding_system_internal,
8514 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8515 doc: /* Internal use only. */)
8516 (coding_system)
8517 Lisp_Object coding_system;
8519 CHECK_SYMBOL (coding_system);
8520 setup_coding_system (Fcheck_coding_system (coding_system),
8521 &safe_terminal_coding);
8522 /* Characer composition should be disabled. */
8523 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8524 safe_terminal_coding.src_multibyte = 1;
8525 safe_terminal_coding.dst_multibyte = 0;
8526 return Qnil;
8529 DEFUN ("terminal-coding-system", Fterminal_coding_system,
8530 Sterminal_coding_system, 0, 1, 0,
8531 doc: /* Return coding system specified for terminal output on the given terminal.
8532 TERMINAL may be a terminal id, a frame, or nil for the selected
8533 frame's terminal device. */)
8534 (terminal)
8535 Lisp_Object terminal;
8537 struct coding_system *terminal_coding
8538 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8539 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
8541 /* For backward compatibility, return nil if it is `undecided'. */
8542 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
8545 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
8546 Sset_keyboard_coding_system_internal, 1, 2, 0,
8547 doc: /* Internal use only. */)
8548 (coding_system, terminal)
8549 Lisp_Object coding_system;
8550 Lisp_Object terminal;
8552 struct terminal *t = get_terminal (terminal, 1);
8553 CHECK_SYMBOL (coding_system);
8554 setup_coding_system (Fcheck_coding_system (coding_system),
8555 TERMINAL_KEYBOARD_CODING (t));
8556 /* Characer composition should be disabled. */
8557 TERMINAL_KEYBOARD_CODING (t)->common_flags
8558 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8559 return Qnil;
8562 DEFUN ("keyboard-coding-system",
8563 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
8564 doc: /* Return coding system specified for decoding keyboard input. */)
8565 (terminal)
8566 Lisp_Object terminal;
8568 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
8569 (get_terminal (terminal, 1))->id);
8573 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8574 Sfind_operation_coding_system, 1, MANY, 0,
8575 doc: /* Choose a coding system for an operation based on the target name.
8576 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8577 DECODING-SYSTEM is the coding system to use for decoding
8578 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8579 for encoding (in case OPERATION does encoding).
8581 The first argument OPERATION specifies an I/O primitive:
8582 For file I/O, `insert-file-contents' or `write-region'.
8583 For process I/O, `call-process', `call-process-region', or `start-process'.
8584 For network I/O, `open-network-stream'.
8586 The remaining arguments should be the same arguments that were passed
8587 to the primitive. Depending on which primitive, one of those arguments
8588 is selected as the TARGET. For example, if OPERATION does file I/O,
8589 whichever argument specifies the file name is TARGET.
8591 TARGET has a meaning which depends on OPERATION:
8592 For file I/O, TARGET is a file name (except for the special case below).
8593 For process I/O, TARGET is a process name.
8594 For network I/O, TARGET is a service name or a port number
8596 This function looks up what specified for TARGET in,
8597 `file-coding-system-alist', `process-coding-system-alist',
8598 or `network-coding-system-alist' depending on OPERATION.
8599 They may specify a coding system, a cons of coding systems,
8600 or a function symbol to call.
8601 In the last case, we call the function with one argument,
8602 which is a list of all the arguments given to this function.
8603 If the function can't decide a coding system, it can return
8604 `undecided' so that the normal code-detection is performed.
8606 If OPERATION is `insert-file-contents', the argument corresponding to
8607 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
8608 file name to look up, and BUFFER is a buffer that contains the file's
8609 contents (not yet decoded). If `file-coding-system-alist' specifies a
8610 function to call for FILENAME, that function should examine the
8611 contents of BUFFER instead of reading the file.
8613 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
8614 (nargs, args)
8615 int nargs;
8616 Lisp_Object *args;
8618 Lisp_Object operation, target_idx, target, val;
8619 register Lisp_Object chain;
8621 if (nargs < 2)
8622 error ("Too few arguments");
8623 operation = args[0];
8624 if (!SYMBOLP (operation)
8625 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8626 error ("Invalid first arguement");
8627 if (nargs < 1 + XINT (target_idx))
8628 error ("Too few arguments for operation: %s",
8629 SDATA (SYMBOL_NAME (operation)));
8630 target = args[XINT (target_idx) + 1];
8631 if (!(STRINGP (target)
8632 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
8633 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
8634 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8635 error ("Invalid %dth argument", XINT (target_idx) + 1);
8636 if (CONSP (target))
8637 target = XCAR (target);
8639 chain = ((EQ (operation, Qinsert_file_contents)
8640 || EQ (operation, Qwrite_region))
8641 ? Vfile_coding_system_alist
8642 : (EQ (operation, Qopen_network_stream)
8643 ? Vnetwork_coding_system_alist
8644 : Vprocess_coding_system_alist));
8645 if (NILP (chain))
8646 return Qnil;
8648 for (; CONSP (chain); chain = XCDR (chain))
8650 Lisp_Object elt;
8652 elt = XCAR (chain);
8653 if (CONSP (elt)
8654 && ((STRINGP (target)
8655 && STRINGP (XCAR (elt))
8656 && fast_string_match (XCAR (elt), target) >= 0)
8657 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8659 val = XCDR (elt);
8660 /* Here, if VAL is both a valid coding system and a valid
8661 function symbol, we return VAL as a coding system. */
8662 if (CONSP (val))
8663 return val;
8664 if (! SYMBOLP (val))
8665 return Qnil;
8666 if (! NILP (Fcoding_system_p (val)))
8667 return Fcons (val, val);
8668 if (! NILP (Ffboundp (val)))
8670 /* We use call1 rather than safe_call1
8671 so as to get bug reports about functions called here
8672 which don't handle the current interface. */
8673 val = call1 (val, Flist (nargs, args));
8674 if (CONSP (val))
8675 return val;
8676 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8677 return Fcons (val, val);
8679 return Qnil;
8682 return Qnil;
8685 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8686 Sset_coding_system_priority, 0, MANY, 0,
8687 doc: /* Assign higher priority to the coding systems given as arguments.
8688 If multiple coding systems belongs to the same category,
8689 all but the first one are ignored.
8691 usage: (set-coding-system-priority ...) */)
8692 (nargs, args)
8693 int nargs;
8694 Lisp_Object *args;
8696 int i, j;
8697 int changed[coding_category_max];
8698 enum coding_category priorities[coding_category_max];
8700 bzero (changed, sizeof changed);
8702 for (i = j = 0; i < nargs; i++)
8704 enum coding_category category;
8705 Lisp_Object spec, attrs;
8707 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8708 attrs = AREF (spec, 0);
8709 category = XINT (CODING_ATTR_CATEGORY (attrs));
8710 if (changed[category])
8711 /* Ignore this coding system because a coding system of the
8712 same category already had a higher priority. */
8713 continue;
8714 changed[category] = 1;
8715 priorities[j++] = category;
8716 if (coding_categories[category].id >= 0
8717 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8718 setup_coding_system (args[i], &coding_categories[category]);
8719 Fset (AREF (Vcoding_category_table, category), args[i]);
8722 /* Now we have decided top J priorities. Reflect the order of the
8723 original priorities to the remaining priorities. */
8725 for (i = j, j = 0; i < coding_category_max; i++, j++)
8727 while (j < coding_category_max
8728 && changed[coding_priorities[j]])
8729 j++;
8730 if (j == coding_category_max)
8731 abort ();
8732 priorities[i] = coding_priorities[j];
8735 bcopy (priorities, coding_priorities, sizeof priorities);
8737 /* Update `coding-category-list'. */
8738 Vcoding_category_list = Qnil;
8739 for (i = coding_category_max - 1; i >= 0; i--)
8740 Vcoding_category_list
8741 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8742 Vcoding_category_list);
8744 return Qnil;
8747 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8748 Scoding_system_priority_list, 0, 1, 0,
8749 doc: /* Return a list of coding systems ordered by their priorities.
8750 HIGHESTP non-nil means just return the highest priority one. */)
8751 (highestp)
8752 Lisp_Object highestp;
8754 int i;
8755 Lisp_Object val;
8757 for (i = 0, val = Qnil; i < coding_category_max; i++)
8759 enum coding_category category = coding_priorities[i];
8760 int id = coding_categories[category].id;
8761 Lisp_Object attrs;
8763 if (id < 0)
8764 continue;
8765 attrs = CODING_ID_ATTRS (id);
8766 if (! NILP (highestp))
8767 return CODING_ATTR_BASE_NAME (attrs);
8768 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8770 return Fnreverse (val);
8773 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8775 static Lisp_Object
8776 make_subsidiaries (base)
8777 Lisp_Object base;
8779 Lisp_Object subsidiaries;
8780 int base_name_len = SBYTES (SYMBOL_NAME (base));
8781 char *buf = (char *) alloca (base_name_len + 6);
8782 int i;
8784 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8785 subsidiaries = Fmake_vector (make_number (3), Qnil);
8786 for (i = 0; i < 3; i++)
8788 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8789 ASET (subsidiaries, i, intern (buf));
8791 return subsidiaries;
8795 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8796 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8797 doc: /* For internal use only.
8798 usage: (define-coding-system-internal ...) */)
8799 (nargs, args)
8800 int nargs;
8801 Lisp_Object *args;
8803 Lisp_Object name;
8804 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8805 Lisp_Object attrs; /* Vector of attributes. */
8806 Lisp_Object eol_type;
8807 Lisp_Object aliases;
8808 Lisp_Object coding_type, charset_list, safe_charsets;
8809 enum coding_category category;
8810 Lisp_Object tail, val;
8811 int max_charset_id = 0;
8812 int i;
8814 if (nargs < coding_arg_max)
8815 goto short_args;
8817 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8819 name = args[coding_arg_name];
8820 CHECK_SYMBOL (name);
8821 CODING_ATTR_BASE_NAME (attrs) = name;
8823 val = args[coding_arg_mnemonic];
8824 if (! STRINGP (val))
8825 CHECK_CHARACTER (val);
8826 CODING_ATTR_MNEMONIC (attrs) = val;
8828 coding_type = args[coding_arg_coding_type];
8829 CHECK_SYMBOL (coding_type);
8830 CODING_ATTR_TYPE (attrs) = coding_type;
8832 charset_list = args[coding_arg_charset_list];
8833 if (SYMBOLP (charset_list))
8835 if (EQ (charset_list, Qiso_2022))
8837 if (! EQ (coding_type, Qiso_2022))
8838 error ("Invalid charset-list");
8839 charset_list = Viso_2022_charset_list;
8841 else if (EQ (charset_list, Qemacs_mule))
8843 if (! EQ (coding_type, Qemacs_mule))
8844 error ("Invalid charset-list");
8845 charset_list = Vemacs_mule_charset_list;
8847 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8848 if (max_charset_id < XFASTINT (XCAR (tail)))
8849 max_charset_id = XFASTINT (XCAR (tail));
8851 else
8853 charset_list = Fcopy_sequence (charset_list);
8854 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8856 struct charset *charset;
8858 val = XCAR (tail);
8859 CHECK_CHARSET_GET_CHARSET (val, charset);
8860 if (EQ (coding_type, Qiso_2022)
8861 ? CHARSET_ISO_FINAL (charset) < 0
8862 : EQ (coding_type, Qemacs_mule)
8863 ? CHARSET_EMACS_MULE_ID (charset) < 0
8864 : 0)
8865 error ("Can't handle charset `%s'",
8866 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8868 XSETCAR (tail, make_number (charset->id));
8869 if (max_charset_id < charset->id)
8870 max_charset_id = charset->id;
8873 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8875 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8876 make_number (255));
8877 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8878 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8879 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8881 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8883 val = args[coding_arg_decode_translation_table];
8884 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8885 CHECK_SYMBOL (val);
8886 CODING_ATTR_DECODE_TBL (attrs) = val;
8888 val = args[coding_arg_encode_translation_table];
8889 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8890 CHECK_SYMBOL (val);
8891 CODING_ATTR_ENCODE_TBL (attrs) = val;
8893 val = args[coding_arg_post_read_conversion];
8894 CHECK_SYMBOL (val);
8895 CODING_ATTR_POST_READ (attrs) = val;
8897 val = args[coding_arg_pre_write_conversion];
8898 CHECK_SYMBOL (val);
8899 CODING_ATTR_PRE_WRITE (attrs) = val;
8901 val = args[coding_arg_default_char];
8902 if (NILP (val))
8903 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8904 else
8906 CHECK_CHARACTER (val);
8907 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8910 val = args[coding_arg_for_unibyte];
8911 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8913 val = args[coding_arg_plist];
8914 CHECK_LIST (val);
8915 CODING_ATTR_PLIST (attrs) = val;
8917 if (EQ (coding_type, Qcharset))
8919 /* Generate a lisp vector of 256 elements. Each element is nil,
8920 integer, or a list of charset IDs.
8922 If Nth element is nil, the byte code N is invalid in this
8923 coding system.
8925 If Nth element is a number NUM, N is the first byte of a
8926 charset whose ID is NUM.
8928 If Nth element is a list of charset IDs, N is the first byte
8929 of one of them. The list is sorted by dimensions of the
8930 charsets. A charset of smaller dimension comes firtst. */
8931 val = Fmake_vector (make_number (256), Qnil);
8933 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8935 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8936 int dim = CHARSET_DIMENSION (charset);
8937 int idx = (dim - 1) * 4;
8939 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8940 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8942 for (i = charset->code_space[idx];
8943 i <= charset->code_space[idx + 1]; i++)
8945 Lisp_Object tmp, tmp2;
8946 int dim2;
8948 tmp = AREF (val, i);
8949 if (NILP (tmp))
8950 tmp = XCAR (tail);
8951 else if (NUMBERP (tmp))
8953 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8954 if (dim < dim2)
8955 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8956 else
8957 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8959 else
8961 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8963 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8964 if (dim < dim2)
8965 break;
8967 if (NILP (tmp2))
8968 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8969 else
8971 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8972 XSETCAR (tmp2, XCAR (tail));
8975 ASET (val, i, tmp);
8978 ASET (attrs, coding_attr_charset_valids, val);
8979 category = coding_category_charset;
8981 else if (EQ (coding_type, Qccl))
8983 Lisp_Object valids;
8985 if (nargs < coding_arg_ccl_max)
8986 goto short_args;
8988 val = args[coding_arg_ccl_decoder];
8989 CHECK_CCL_PROGRAM (val);
8990 if (VECTORP (val))
8991 val = Fcopy_sequence (val);
8992 ASET (attrs, coding_attr_ccl_decoder, val);
8994 val = args[coding_arg_ccl_encoder];
8995 CHECK_CCL_PROGRAM (val);
8996 if (VECTORP (val))
8997 val = Fcopy_sequence (val);
8998 ASET (attrs, coding_attr_ccl_encoder, val);
9000 val = args[coding_arg_ccl_valids];
9001 valids = Fmake_string (make_number (256), make_number (0));
9002 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9004 int from, to;
9006 val = Fcar (tail);
9007 if (INTEGERP (val))
9009 from = to = XINT (val);
9010 if (from < 0 || from > 255)
9011 args_out_of_range_3 (val, make_number (0), make_number (255));
9013 else
9015 CHECK_CONS (val);
9016 CHECK_NATNUM_CAR (val);
9017 CHECK_NATNUM_CDR (val);
9018 from = XINT (XCAR (val));
9019 if (from > 255)
9020 args_out_of_range_3 (XCAR (val),
9021 make_number (0), make_number (255));
9022 to = XINT (XCDR (val));
9023 if (to < from || to > 255)
9024 args_out_of_range_3 (XCDR (val),
9025 XCAR (val), make_number (255));
9027 for (i = from; i <= to; i++)
9028 SSET (valids, i, 1);
9030 ASET (attrs, coding_attr_ccl_valids, valids);
9032 category = coding_category_ccl;
9034 else if (EQ (coding_type, Qutf_16))
9036 Lisp_Object bom, endian;
9038 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9040 if (nargs < coding_arg_utf16_max)
9041 goto short_args;
9043 bom = args[coding_arg_utf16_bom];
9044 if (! NILP (bom) && ! EQ (bom, Qt))
9046 CHECK_CONS (bom);
9047 val = XCAR (bom);
9048 CHECK_CODING_SYSTEM (val);
9049 val = XCDR (bom);
9050 CHECK_CODING_SYSTEM (val);
9052 ASET (attrs, coding_attr_utf_16_bom, bom);
9054 endian = args[coding_arg_utf16_endian];
9055 CHECK_SYMBOL (endian);
9056 if (NILP (endian))
9057 endian = Qbig;
9058 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9059 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9060 ASET (attrs, coding_attr_utf_16_endian, endian);
9062 category = (CONSP (bom)
9063 ? coding_category_utf_16_auto
9064 : NILP (bom)
9065 ? (EQ (endian, Qbig)
9066 ? coding_category_utf_16_be_nosig
9067 : coding_category_utf_16_le_nosig)
9068 : (EQ (endian, Qbig)
9069 ? coding_category_utf_16_be
9070 : coding_category_utf_16_le));
9072 else if (EQ (coding_type, Qiso_2022))
9074 Lisp_Object initial, reg_usage, request, flags;
9075 int i;
9077 if (nargs < coding_arg_iso2022_max)
9078 goto short_args;
9080 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9081 CHECK_VECTOR (initial);
9082 for (i = 0; i < 4; i++)
9084 val = Faref (initial, make_number (i));
9085 if (! NILP (val))
9087 struct charset *charset;
9089 CHECK_CHARSET_GET_CHARSET (val, charset);
9090 ASET (initial, i, make_number (CHARSET_ID (charset)));
9091 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9092 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9094 else
9095 ASET (initial, i, make_number (-1));
9098 reg_usage = args[coding_arg_iso2022_reg_usage];
9099 CHECK_CONS (reg_usage);
9100 CHECK_NUMBER_CAR (reg_usage);
9101 CHECK_NUMBER_CDR (reg_usage);
9103 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9104 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9106 int id;
9107 Lisp_Object tmp;
9109 val = Fcar (tail);
9110 CHECK_CONS (val);
9111 tmp = XCAR (val);
9112 CHECK_CHARSET_GET_ID (tmp, id);
9113 CHECK_NATNUM_CDR (val);
9114 if (XINT (XCDR (val)) >= 4)
9115 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9116 XSETCAR (val, make_number (id));
9119 flags = args[coding_arg_iso2022_flags];
9120 CHECK_NATNUM (flags);
9121 i = XINT (flags);
9122 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9123 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9125 ASET (attrs, coding_attr_iso_initial, initial);
9126 ASET (attrs, coding_attr_iso_usage, reg_usage);
9127 ASET (attrs, coding_attr_iso_request, request);
9128 ASET (attrs, coding_attr_iso_flags, flags);
9129 setup_iso_safe_charsets (attrs);
9131 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9132 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9133 | CODING_ISO_FLAG_SINGLE_SHIFT))
9134 ? coding_category_iso_7_else
9135 : EQ (args[coding_arg_charset_list], Qiso_2022)
9136 ? coding_category_iso_7
9137 : coding_category_iso_7_tight);
9138 else
9140 int id = XINT (AREF (initial, 1));
9142 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9143 || EQ (args[coding_arg_charset_list], Qiso_2022)
9144 || id < 0)
9145 ? coding_category_iso_8_else
9146 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9147 ? coding_category_iso_8_1
9148 : coding_category_iso_8_2);
9150 if (category != coding_category_iso_8_1
9151 && category != coding_category_iso_8_2)
9152 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9154 else if (EQ (coding_type, Qemacs_mule))
9156 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9157 ASET (attrs, coding_attr_emacs_mule_full, Qt);
9158 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9159 category = coding_category_emacs_mule;
9161 else if (EQ (coding_type, Qshift_jis))
9164 struct charset *charset;
9166 if (XINT (Flength (charset_list)) != 3
9167 && XINT (Flength (charset_list)) != 4)
9168 error ("There should be three or four charsets");
9170 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9171 if (CHARSET_DIMENSION (charset) != 1)
9172 error ("Dimension of charset %s is not one",
9173 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9174 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9175 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9177 charset_list = XCDR (charset_list);
9178 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9179 if (CHARSET_DIMENSION (charset) != 1)
9180 error ("Dimension of charset %s is not one",
9181 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9183 charset_list = XCDR (charset_list);
9184 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9185 if (CHARSET_DIMENSION (charset) != 2)
9186 error ("Dimension of charset %s is not two",
9187 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9189 charset_list = XCDR (charset_list);
9190 if (! NILP (charset_list))
9192 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9193 if (CHARSET_DIMENSION (charset) != 2)
9194 error ("Dimension of charset %s is not two",
9195 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9198 category = coding_category_sjis;
9199 Vsjis_coding_system = name;
9201 else if (EQ (coding_type, Qbig5))
9203 struct charset *charset;
9205 if (XINT (Flength (charset_list)) != 2)
9206 error ("There should be just two charsets");
9208 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9209 if (CHARSET_DIMENSION (charset) != 1)
9210 error ("Dimension of charset %s is not one",
9211 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9212 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9213 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9215 charset_list = XCDR (charset_list);
9216 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9217 if (CHARSET_DIMENSION (charset) != 2)
9218 error ("Dimension of charset %s is not two",
9219 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9221 category = coding_category_big5;
9222 Vbig5_coding_system = name;
9224 else if (EQ (coding_type, Qraw_text))
9226 category = coding_category_raw_text;
9227 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9229 else if (EQ (coding_type, Qutf_8))
9231 category = coding_category_utf_8;
9232 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9234 else if (EQ (coding_type, Qundecided))
9235 category = coding_category_undecided;
9236 else
9237 error ("Invalid coding system type: %s",
9238 SDATA (SYMBOL_NAME (coding_type)));
9240 CODING_ATTR_CATEGORY (attrs) = make_number (category);
9241 CODING_ATTR_PLIST (attrs)
9242 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9243 CODING_ATTR_PLIST (attrs)));
9244 CODING_ATTR_PLIST (attrs)
9245 = Fcons (QCascii_compatible_p,
9246 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9247 CODING_ATTR_PLIST (attrs)));
9249 eol_type = args[coding_arg_eol_type];
9250 if (! NILP (eol_type)
9251 && ! EQ (eol_type, Qunix)
9252 && ! EQ (eol_type, Qdos)
9253 && ! EQ (eol_type, Qmac))
9254 error ("Invalid eol-type");
9256 aliases = Fcons (name, Qnil);
9258 if (NILP (eol_type))
9260 eol_type = make_subsidiaries (name);
9261 for (i = 0; i < 3; i++)
9263 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9265 this_name = AREF (eol_type, i);
9266 this_aliases = Fcons (this_name, Qnil);
9267 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9268 this_spec = Fmake_vector (make_number (3), attrs);
9269 ASET (this_spec, 1, this_aliases);
9270 ASET (this_spec, 2, this_eol_type);
9271 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9272 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9273 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9274 if (NILP (val))
9275 Vcoding_system_alist
9276 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9277 Vcoding_system_alist);
9281 spec_vec = Fmake_vector (make_number (3), attrs);
9282 ASET (spec_vec, 1, aliases);
9283 ASET (spec_vec, 2, eol_type);
9285 Fputhash (name, spec_vec, Vcoding_system_hash_table);
9286 Vcoding_system_list = Fcons (name, Vcoding_system_list);
9287 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9288 if (NILP (val))
9289 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9290 Vcoding_system_alist);
9293 int id = coding_categories[category].id;
9295 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9296 setup_coding_system (name, &coding_categories[category]);
9299 return Qnil;
9301 short_args:
9302 return Fsignal (Qwrong_number_of_arguments,
9303 Fcons (intern ("define-coding-system-internal"),
9304 make_number (nargs)));
9308 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9309 3, 3, 0,
9310 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
9311 (coding_system, prop, val)
9312 Lisp_Object coding_system, prop, val;
9314 Lisp_Object spec, attrs;
9316 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9317 attrs = AREF (spec, 0);
9318 if (EQ (prop, QCmnemonic))
9320 if (! STRINGP (val))
9321 CHECK_CHARACTER (val);
9322 CODING_ATTR_MNEMONIC (attrs) = val;
9324 else if (EQ (prop, QCdefalut_char))
9326 if (NILP (val))
9327 val = make_number (' ');
9328 else
9329 CHECK_CHARACTER (val);
9330 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9332 else if (EQ (prop, QCdecode_translation_table))
9334 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9335 CHECK_SYMBOL (val);
9336 CODING_ATTR_DECODE_TBL (attrs) = val;
9338 else if (EQ (prop, QCencode_translation_table))
9340 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9341 CHECK_SYMBOL (val);
9342 CODING_ATTR_ENCODE_TBL (attrs) = val;
9344 else if (EQ (prop, QCpost_read_conversion))
9346 CHECK_SYMBOL (val);
9347 CODING_ATTR_POST_READ (attrs) = val;
9349 else if (EQ (prop, QCpre_write_conversion))
9351 CHECK_SYMBOL (val);
9352 CODING_ATTR_PRE_WRITE (attrs) = val;
9354 else if (EQ (prop, QCascii_compatible_p))
9356 CODING_ATTR_ASCII_COMPAT (attrs) = val;
9359 CODING_ATTR_PLIST (attrs)
9360 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9361 return val;
9365 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9366 Sdefine_coding_system_alias, 2, 2, 0,
9367 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
9368 (alias, coding_system)
9369 Lisp_Object alias, coding_system;
9371 Lisp_Object spec, aliases, eol_type, val;
9373 CHECK_SYMBOL (alias);
9374 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9375 aliases = AREF (spec, 1);
9376 /* ALISES should be a list of length more than zero, and the first
9377 element is a base coding system. Append ALIAS at the tail of the
9378 list. */
9379 while (!NILP (XCDR (aliases)))
9380 aliases = XCDR (aliases);
9381 XSETCDR (aliases, Fcons (alias, Qnil));
9383 eol_type = AREF (spec, 2);
9384 if (VECTORP (eol_type))
9386 Lisp_Object subsidiaries;
9387 int i;
9389 subsidiaries = make_subsidiaries (alias);
9390 for (i = 0; i < 3; i++)
9391 Fdefine_coding_system_alias (AREF (subsidiaries, i),
9392 AREF (eol_type, i));
9395 Fputhash (alias, spec, Vcoding_system_hash_table);
9396 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9397 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9398 if (NILP (val))
9399 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9400 Vcoding_system_alist);
9402 return Qnil;
9405 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9406 1, 1, 0,
9407 doc: /* Return the base of CODING-SYSTEM.
9408 Any alias or subsidiary coding system is not a base coding system. */)
9409 (coding_system)
9410 Lisp_Object coding_system;
9412 Lisp_Object spec, attrs;
9414 if (NILP (coding_system))
9415 return (Qno_conversion);
9416 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9417 attrs = AREF (spec, 0);
9418 return CODING_ATTR_BASE_NAME (attrs);
9421 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9422 1, 1, 0,
9423 doc: "Return the property list of CODING-SYSTEM.")
9424 (coding_system)
9425 Lisp_Object coding_system;
9427 Lisp_Object spec, attrs;
9429 if (NILP (coding_system))
9430 coding_system = Qno_conversion;
9431 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9432 attrs = AREF (spec, 0);
9433 return CODING_ATTR_PLIST (attrs);
9437 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9438 1, 1, 0,
9439 doc: /* Return the list of aliases of CODING-SYSTEM. */)
9440 (coding_system)
9441 Lisp_Object coding_system;
9443 Lisp_Object spec;
9445 if (NILP (coding_system))
9446 coding_system = Qno_conversion;
9447 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9448 return AREF (spec, 1);
9451 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9452 Scoding_system_eol_type, 1, 1, 0,
9453 doc: /* Return eol-type of CODING-SYSTEM.
9454 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9456 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9457 and CR respectively.
9459 A vector value indicates that a format of end-of-line should be
9460 detected automatically. Nth element of the vector is the subsidiary
9461 coding system whose eol-type is N. */)
9462 (coding_system)
9463 Lisp_Object coding_system;
9465 Lisp_Object spec, eol_type;
9466 int n;
9468 if (NILP (coding_system))
9469 coding_system = Qno_conversion;
9470 if (! CODING_SYSTEM_P (coding_system))
9471 return Qnil;
9472 spec = CODING_SYSTEM_SPEC (coding_system);
9473 eol_type = AREF (spec, 2);
9474 if (VECTORP (eol_type))
9475 return Fcopy_sequence (eol_type);
9476 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9477 return make_number (n);
9480 #endif /* emacs */
9483 /*** 9. Post-amble ***/
9485 void
9486 init_coding_once ()
9488 int i;
9490 for (i = 0; i < coding_category_max; i++)
9492 coding_categories[i].id = -1;
9493 coding_priorities[i] = i;
9496 /* ISO2022 specific initialize routine. */
9497 for (i = 0; i < 0x20; i++)
9498 iso_code_class[i] = ISO_control_0;
9499 for (i = 0x21; i < 0x7F; i++)
9500 iso_code_class[i] = ISO_graphic_plane_0;
9501 for (i = 0x80; i < 0xA0; i++)
9502 iso_code_class[i] = ISO_control_1;
9503 for (i = 0xA1; i < 0xFF; i++)
9504 iso_code_class[i] = ISO_graphic_plane_1;
9505 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9506 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9507 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9508 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9509 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9510 iso_code_class[ISO_CODE_ESC] = ISO_escape;
9511 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9512 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9513 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9515 for (i = 0; i < 256; i++)
9517 emacs_mule_bytes[i] = 1;
9519 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9520 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9521 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9522 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9525 #ifdef emacs
9527 void
9528 syms_of_coding ()
9530 staticpro (&Vcoding_system_hash_table);
9532 Lisp_Object args[2];
9533 args[0] = QCtest;
9534 args[1] = Qeq;
9535 Vcoding_system_hash_table = Fmake_hash_table (2, args);
9538 staticpro (&Vsjis_coding_system);
9539 Vsjis_coding_system = Qnil;
9541 staticpro (&Vbig5_coding_system);
9542 Vbig5_coding_system = Qnil;
9544 staticpro (&Vcode_conversion_reused_workbuf);
9545 Vcode_conversion_reused_workbuf = Qnil;
9547 staticpro (&Vcode_conversion_workbuf_name);
9548 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9550 reused_workbuf_in_use = 0;
9552 DEFSYM (Qcharset, "charset");
9553 DEFSYM (Qtarget_idx, "target-idx");
9554 DEFSYM (Qcoding_system_history, "coding-system-history");
9555 Fset (Qcoding_system_history, Qnil);
9557 /* Target FILENAME is the first argument. */
9558 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9559 /* Target FILENAME is the third argument. */
9560 Fput (Qwrite_region, Qtarget_idx, make_number (2));
9562 DEFSYM (Qcall_process, "call-process");
9563 /* Target PROGRAM is the first argument. */
9564 Fput (Qcall_process, Qtarget_idx, make_number (0));
9566 DEFSYM (Qcall_process_region, "call-process-region");
9567 /* Target PROGRAM is the third argument. */
9568 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9570 DEFSYM (Qstart_process, "start-process");
9571 /* Target PROGRAM is the third argument. */
9572 Fput (Qstart_process, Qtarget_idx, make_number (2));
9574 DEFSYM (Qopen_network_stream, "open-network-stream");
9575 /* Target SERVICE is the fourth argument. */
9576 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9578 DEFSYM (Qcoding_system, "coding-system");
9579 DEFSYM (Qcoding_aliases, "coding-aliases");
9581 DEFSYM (Qeol_type, "eol-type");
9582 DEFSYM (Qunix, "unix");
9583 DEFSYM (Qdos, "dos");
9585 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9586 DEFSYM (Qpost_read_conversion, "post-read-conversion");
9587 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9588 DEFSYM (Qdefault_char, "default-char");
9589 DEFSYM (Qundecided, "undecided");
9590 DEFSYM (Qno_conversion, "no-conversion");
9591 DEFSYM (Qraw_text, "raw-text");
9593 DEFSYM (Qiso_2022, "iso-2022");
9595 DEFSYM (Qutf_8, "utf-8");
9596 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9598 DEFSYM (Qutf_16, "utf-16");
9599 DEFSYM (Qbig, "big");
9600 DEFSYM (Qlittle, "little");
9602 DEFSYM (Qshift_jis, "shift-jis");
9603 DEFSYM (Qbig5, "big5");
9605 DEFSYM (Qcoding_system_p, "coding-system-p");
9607 DEFSYM (Qcoding_system_error, "coding-system-error");
9608 Fput (Qcoding_system_error, Qerror_conditions,
9609 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9610 Fput (Qcoding_system_error, Qerror_message,
9611 build_string ("Invalid coding system"));
9613 /* Intern this now in case it isn't already done.
9614 Setting this variable twice is harmless.
9615 But don't staticpro it here--that is done in alloc.c. */
9616 Qchar_table_extra_slots = intern ("char-table-extra-slots");
9618 DEFSYM (Qtranslation_table, "translation-table");
9619 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9620 DEFSYM (Qtranslation_table_id, "translation-table-id");
9621 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9622 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9624 DEFSYM (Qvalid_codes, "valid-codes");
9626 DEFSYM (Qemacs_mule, "emacs-mule");
9628 DEFSYM (QCcategory, ":category");
9629 DEFSYM (QCmnemonic, ":mnemonic");
9630 DEFSYM (QCdefalut_char, ":default-char");
9631 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9632 DEFSYM (QCencode_translation_table, ":encode-translation-table");
9633 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9634 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9635 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9637 Vcoding_category_table
9638 = Fmake_vector (make_number (coding_category_max), Qnil);
9639 staticpro (&Vcoding_category_table);
9640 /* Followings are target of code detection. */
9641 ASET (Vcoding_category_table, coding_category_iso_7,
9642 intern ("coding-category-iso-7"));
9643 ASET (Vcoding_category_table, coding_category_iso_7_tight,
9644 intern ("coding-category-iso-7-tight"));
9645 ASET (Vcoding_category_table, coding_category_iso_8_1,
9646 intern ("coding-category-iso-8-1"));
9647 ASET (Vcoding_category_table, coding_category_iso_8_2,
9648 intern ("coding-category-iso-8-2"));
9649 ASET (Vcoding_category_table, coding_category_iso_7_else,
9650 intern ("coding-category-iso-7-else"));
9651 ASET (Vcoding_category_table, coding_category_iso_8_else,
9652 intern ("coding-category-iso-8-else"));
9653 ASET (Vcoding_category_table, coding_category_utf_8,
9654 intern ("coding-category-utf-8"));
9655 ASET (Vcoding_category_table, coding_category_utf_16_be,
9656 intern ("coding-category-utf-16-be"));
9657 ASET (Vcoding_category_table, coding_category_utf_16_auto,
9658 intern ("coding-category-utf-16-auto"));
9659 ASET (Vcoding_category_table, coding_category_utf_16_le,
9660 intern ("coding-category-utf-16-le"));
9661 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9662 intern ("coding-category-utf-16-be-nosig"));
9663 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9664 intern ("coding-category-utf-16-le-nosig"));
9665 ASET (Vcoding_category_table, coding_category_charset,
9666 intern ("coding-category-charset"));
9667 ASET (Vcoding_category_table, coding_category_sjis,
9668 intern ("coding-category-sjis"));
9669 ASET (Vcoding_category_table, coding_category_big5,
9670 intern ("coding-category-big5"));
9671 ASET (Vcoding_category_table, coding_category_ccl,
9672 intern ("coding-category-ccl"));
9673 ASET (Vcoding_category_table, coding_category_emacs_mule,
9674 intern ("coding-category-emacs-mule"));
9675 /* Followings are NOT target of code detection. */
9676 ASET (Vcoding_category_table, coding_category_raw_text,
9677 intern ("coding-category-raw-text"));
9678 ASET (Vcoding_category_table, coding_category_undecided,
9679 intern ("coding-category-undecided"));
9681 DEFSYM (Qinsufficient_source, "insufficient-source");
9682 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9683 DEFSYM (Qinvalid_source, "invalid-source");
9684 DEFSYM (Qinterrupted, "interrupted");
9685 DEFSYM (Qinsufficient_memory, "insufficient-memory");
9686 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9688 defsubr (&Scoding_system_p);
9689 defsubr (&Sread_coding_system);
9690 defsubr (&Sread_non_nil_coding_system);
9691 defsubr (&Scheck_coding_system);
9692 defsubr (&Sdetect_coding_region);
9693 defsubr (&Sdetect_coding_string);
9694 defsubr (&Sfind_coding_systems_region_internal);
9695 defsubr (&Sunencodable_char_position);
9696 defsubr (&Scheck_coding_systems_region);
9697 defsubr (&Sdecode_coding_region);
9698 defsubr (&Sencode_coding_region);
9699 defsubr (&Sdecode_coding_string);
9700 defsubr (&Sencode_coding_string);
9701 defsubr (&Sdecode_sjis_char);
9702 defsubr (&Sencode_sjis_char);
9703 defsubr (&Sdecode_big5_char);
9704 defsubr (&Sencode_big5_char);
9705 defsubr (&Sset_terminal_coding_system_internal);
9706 defsubr (&Sset_safe_terminal_coding_system_internal);
9707 defsubr (&Sterminal_coding_system);
9708 defsubr (&Sset_keyboard_coding_system_internal);
9709 defsubr (&Skeyboard_coding_system);
9710 defsubr (&Sfind_operation_coding_system);
9711 defsubr (&Sset_coding_system_priority);
9712 defsubr (&Sdefine_coding_system_internal);
9713 defsubr (&Sdefine_coding_system_alias);
9714 defsubr (&Scoding_system_put);
9715 defsubr (&Scoding_system_base);
9716 defsubr (&Scoding_system_plist);
9717 defsubr (&Scoding_system_aliases);
9718 defsubr (&Scoding_system_eol_type);
9719 defsubr (&Scoding_system_priority_list);
9721 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9722 doc: /* List of coding systems.
9724 Do not alter the value of this variable manually. This variable should be
9725 updated by the functions `define-coding-system' and
9726 `define-coding-system-alias'. */);
9727 Vcoding_system_list = Qnil;
9729 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9730 doc: /* Alist of coding system names.
9731 Each element is one element list of coding system name.
9732 This variable is given to `completing-read' as TABLE argument.
9734 Do not alter the value of this variable manually. This variable should be
9735 updated by the functions `make-coding-system' and
9736 `define-coding-system-alias'. */);
9737 Vcoding_system_alist = Qnil;
9739 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9740 doc: /* List of coding-categories (symbols) ordered by priority.
9742 On detecting a coding system, Emacs tries code detection algorithms
9743 associated with each coding-category one by one in this order. When
9744 one algorithm agrees with a byte sequence of source text, the coding
9745 system bound to the corresponding coding-category is selected.
9747 Don't modify this variable directly, but use `set-coding-priority'. */);
9749 int i;
9751 Vcoding_category_list = Qnil;
9752 for (i = coding_category_max - 1; i >= 0; i--)
9753 Vcoding_category_list
9754 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9755 Vcoding_category_list);
9758 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9759 doc: /* Specify the coding system for read operations.
9760 It is useful to bind this variable with `let', but do not set it globally.
9761 If the value is a coding system, it is used for decoding on read operation.
9762 If not, an appropriate element is used from one of the coding system alists:
9763 There are three such tables, `file-coding-system-alist',
9764 `process-coding-system-alist', and `network-coding-system-alist'. */);
9765 Vcoding_system_for_read = Qnil;
9767 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9768 doc: /* Specify the coding system for write operations.
9769 Programs bind this variable with `let', but you should not set it globally.
9770 If the value is a coding system, it is used for encoding of output,
9771 when writing it to a file and when sending it to a file or subprocess.
9773 If this does not specify a coding system, an appropriate element
9774 is used from one of the coding system alists:
9775 There are three such tables, `file-coding-system-alist',
9776 `process-coding-system-alist', and `network-coding-system-alist'.
9777 For output to files, if the above procedure does not specify a coding system,
9778 the value of `buffer-file-coding-system' is used. */);
9779 Vcoding_system_for_write = Qnil;
9781 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9782 doc: /*
9783 Coding system used in the latest file or process I/O. */);
9784 Vlast_coding_system_used = Qnil;
9786 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9787 doc: /*
9788 Error status of the last code conversion.
9790 When an error was detected in the last code conversion, this variable
9791 is set to one of the following symbols.
9792 `insufficient-source'
9793 `inconsistent-eol'
9794 `invalid-source'
9795 `interrupted'
9796 `insufficient-memory'
9797 When no error was detected, the value doesn't change. So, to check
9798 the error status of a code conversion by this variable, you must
9799 explicitly set this variable to nil before performing code
9800 conversion. */);
9801 Vlast_code_conversion_error = Qnil;
9803 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9804 doc: /*
9805 *Non-nil means always inhibit code conversion of end-of-line format.
9806 See info node `Coding Systems' and info node `Text and Binary' concerning
9807 such conversion. */);
9808 inhibit_eol_conversion = 0;
9810 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9811 doc: /*
9812 Non-nil means process buffer inherits coding system of process output.
9813 Bind it to t if the process output is to be treated as if it were a file
9814 read from some filesystem. */);
9815 inherit_process_coding_system = 0;
9817 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9818 doc: /*
9819 Alist to decide a coding system to use for a file I/O operation.
9820 The format is ((PATTERN . VAL) ...),
9821 where PATTERN is a regular expression matching a file name,
9822 VAL is a coding system, a cons of coding systems, or a function symbol.
9823 If VAL is a coding system, it is used for both decoding and encoding
9824 the file contents.
9825 If VAL is a cons of coding systems, the car part is used for decoding,
9826 and the cdr part is used for encoding.
9827 If VAL is a function symbol, the function must return a coding system
9828 or a cons of coding systems which are used as above. The function is
9829 called with an argument that is a list of the arguments with which
9830 `find-operation-coding-system' was called. If the function can't decide
9831 a coding system, it can return `undecided' so that the normal
9832 code-detection is performed.
9834 See also the function `find-operation-coding-system'
9835 and the variable `auto-coding-alist'. */);
9836 Vfile_coding_system_alist = Qnil;
9838 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9839 doc: /*
9840 Alist to decide a coding system to use for a process I/O operation.
9841 The format is ((PATTERN . VAL) ...),
9842 where PATTERN is a regular expression matching a program name,
9843 VAL is a coding system, a cons of coding systems, or a function symbol.
9844 If VAL is a coding system, it is used for both decoding what received
9845 from the program and encoding what sent to the program.
9846 If VAL is a cons of coding systems, the car part is used for decoding,
9847 and the cdr part is used for encoding.
9848 If VAL is a function symbol, the function must return a coding system
9849 or a cons of coding systems which are used as above.
9851 See also the function `find-operation-coding-system'. */);
9852 Vprocess_coding_system_alist = Qnil;
9854 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9855 doc: /*
9856 Alist to decide a coding system to use for a network I/O operation.
9857 The format is ((PATTERN . VAL) ...),
9858 where PATTERN is a regular expression matching a network service name
9859 or is a port number to connect to,
9860 VAL is a coding system, a cons of coding systems, or a function symbol.
9861 If VAL is a coding system, it is used for both decoding what received
9862 from the network stream and encoding what sent to the network stream.
9863 If VAL is a cons of coding systems, the car part is used for decoding,
9864 and the cdr part is used for encoding.
9865 If VAL is a function symbol, the function must return a coding system
9866 or a cons of coding systems which are used as above.
9868 See also the function `find-operation-coding-system'. */);
9869 Vnetwork_coding_system_alist = Qnil;
9871 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9872 doc: /* Coding system to use with system messages.
9873 Also used for decoding keyboard input on X Window system. */);
9874 Vlocale_coding_system = Qnil;
9876 /* The eol mnemonics are reset in startup.el system-dependently. */
9877 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9878 doc: /*
9879 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9880 eol_mnemonic_unix = build_string (":");
9882 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9883 doc: /*
9884 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9885 eol_mnemonic_dos = build_string ("\\");
9887 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9888 doc: /*
9889 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9890 eol_mnemonic_mac = build_string ("/");
9892 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9893 doc: /*
9894 *String displayed in mode line when end-of-line format is not yet determined. */);
9895 eol_mnemonic_undecided = build_string (":");
9897 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9898 doc: /*
9899 *Non-nil enables character translation while encoding and decoding. */);
9900 Venable_character_translation = Qt;
9902 DEFVAR_LISP ("standard-translation-table-for-decode",
9903 &Vstandard_translation_table_for_decode,
9904 doc: /* Table for translating characters while decoding. */);
9905 Vstandard_translation_table_for_decode = Qnil;
9907 DEFVAR_LISP ("standard-translation-table-for-encode",
9908 &Vstandard_translation_table_for_encode,
9909 doc: /* Table for translating characters while encoding. */);
9910 Vstandard_translation_table_for_encode = Qnil;
9912 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9913 doc: /* Alist of charsets vs revision numbers.
9914 While encoding, if a charset (car part of an element) is found,
9915 designate it with the escape sequence identifying revision (cdr part
9916 of the element). */);
9917 Vcharset_revision_table = Qnil;
9919 DEFVAR_LISP ("default-process-coding-system",
9920 &Vdefault_process_coding_system,
9921 doc: /* Cons of coding systems used for process I/O by default.
9922 The car part is used for decoding a process output,
9923 the cdr part is used for encoding a text to be sent to a process. */);
9924 Vdefault_process_coding_system = Qnil;
9926 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9927 doc: /*
9928 Table of extra Latin codes in the range 128..159 (inclusive).
9929 This is a vector of length 256.
9930 If Nth element is non-nil, the existence of code N in a file
9931 \(or output of subprocess) doesn't prevent it to be detected as
9932 a coding system of ISO 2022 variant which has a flag
9933 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9934 or reading output of a subprocess.
9935 Only 128th through 159th elements has a meaning. */);
9936 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9938 DEFVAR_LISP ("select-safe-coding-system-function",
9939 &Vselect_safe_coding_system_function,
9940 doc: /*
9941 Function to call to select safe coding system for encoding a text.
9943 If set, this function is called to force a user to select a proper
9944 coding system which can encode the text in the case that a default
9945 coding system used in each operation can't encode the text. The
9946 function should take care that the buffer is not modified while
9947 the coding system is being selected.
9949 The default value is `select-safe-coding-system' (which see). */);
9950 Vselect_safe_coding_system_function = Qnil;
9952 DEFVAR_BOOL ("coding-system-require-warning",
9953 &coding_system_require_warning,
9954 doc: /* Internal use only.
9955 If non-nil, on writing a file, `select-safe-coding-system-function' is
9956 called even if `coding-system-for-write' is non-nil. The command
9957 `universal-coding-system-argument' binds this variable to t temporarily. */);
9958 coding_system_require_warning = 0;
9961 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9962 &inhibit_iso_escape_detection,
9963 doc: /*
9964 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9966 By default, on reading a file, Emacs tries to detect how the text is
9967 encoded. This code detection is sensitive to escape sequences. If
9968 the sequence is valid as ISO2022, the code is determined as one of
9969 the ISO2022 encodings, and the file is decoded by the corresponding
9970 coding system (e.g. `iso-2022-7bit').
9972 However, there may be a case that you want to read escape sequences in
9973 a file as is. In such a case, you can set this variable to non-nil.
9974 Then, as the code detection ignores any escape sequences, no file is
9975 detected as encoded in some ISO2022 encoding. The result is that all
9976 escape sequences become visible in a buffer.
9978 The default value is nil, and it is strongly recommended not to change
9979 it. That is because many Emacs Lisp source files that contain
9980 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9981 in Emacs's distribution, and they won't be decoded correctly on
9982 reading if you suppress escape sequence detection.
9984 The other way to read escape sequences in a file without decoding is
9985 to explicitly specify some coding system that doesn't use ISO2022's
9986 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9987 inhibit_iso_escape_detection = 0;
9989 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9990 doc: /* Char table for translating self-inserting characters.
9991 This is applied to the result of input methods, not their input. See also
9992 `keyboard-translate-table'. */);
9993 Vtranslation_table_for_input = Qnil;
9996 Lisp_Object args[coding_arg_max];
9997 Lisp_Object plist[16];
9998 int i;
10000 for (i = 0; i < coding_arg_max; i++)
10001 args[i] = Qnil;
10003 plist[0] = intern (":name");
10004 plist[1] = args[coding_arg_name] = Qno_conversion;
10005 plist[2] = intern (":mnemonic");
10006 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10007 plist[4] = intern (":coding-type");
10008 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10009 plist[6] = intern (":ascii-compatible-p");
10010 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10011 plist[8] = intern (":default-char");
10012 plist[9] = args[coding_arg_default_char] = make_number (0);
10013 plist[10] = intern (":for-unibyte");
10014 plist[11] = args[coding_arg_for_unibyte] = Qt;
10015 plist[12] = intern (":docstring");
10016 plist[13] = build_string ("Do no conversion.\n\
10018 When you visit a file with this coding, the file is read into a\n\
10019 unibyte buffer as is, thus each byte of a file is treated as a\n\
10020 character.");
10021 plist[14] = intern (":eol-type");
10022 plist[15] = args[coding_arg_eol_type] = Qunix;
10023 args[coding_arg_plist] = Flist (16, plist);
10024 Fdefine_coding_system_internal (coding_arg_max, args);
10026 plist[1] = args[coding_arg_name] = Qundecided;
10027 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10028 plist[5] = args[coding_arg_coding_type] = Qundecided;
10029 /* This is already set.
10030 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10031 plist[8] = intern (":charset-list");
10032 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10033 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10034 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10035 plist[15] = args[coding_arg_eol_type] = Qnil;
10036 args[coding_arg_plist] = Flist (16, plist);
10037 Fdefine_coding_system_internal (coding_arg_max, args);
10040 setup_coding_system (Qno_conversion, &safe_terminal_coding);
10043 int i;
10045 for (i = 0; i < coding_category_max; i++)
10046 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10048 #if defined (MSDOS) || defined (WINDOWSNT)
10049 system_eol_type = Qdos;
10050 #else
10051 system_eol_type = Qunix;
10052 #endif
10053 staticpro (&system_eol_type);
10056 char *
10057 emacs_strerror (error_number)
10058 int error_number;
10060 char *str;
10062 synchronize_system_messages_locale ();
10063 str = strerror (error_number);
10065 if (! NILP (Vlocale_coding_system))
10067 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10068 Vlocale_coding_system,
10070 str = (char *) SDATA (dec);
10073 return str;
10076 #endif /* emacs */
10078 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10079 (do not change this comment) */