Fix bug #13515 with processing DBCS file names on MS-Windows.
[emacs.git] / src / charset.h
blobd9a5662e520653c2831b181fd3ef960c812f6053
1 /* Header for charset handler.
2 Copyright (C) 2001-2013 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
8 Copyright (C) 2003
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
12 This file is part of GNU Emacs.
14 GNU Emacs is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
19 GNU Emacs is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
27 #ifndef EMACS_CHARSET_H
28 #define EMACS_CHARSET_H
30 #include <verify.h>
32 INLINE_HEADER_BEGIN
33 #ifndef CHARSET_INLINE
34 # define CHARSET_INLINE INLINE
35 #endif
/* Index to arguments of Fdefine_charset_internal.  The final
   enumerator, charset_arg_max, is not an argument index itself; its
   value is the number of indices listed before it.  */

enum define_charset_arg_index
  {
    charset_arg_name,
    charset_arg_dimension,
    charset_arg_code_space,
    charset_arg_min_code,
    charset_arg_max_code,
    charset_arg_iso_final,
    charset_arg_iso_revision,
    charset_arg_emacs_mule_id,
    charset_arg_ascii_compatible_p,
    charset_arg_supplementary_p,
    charset_arg_invalid_code,
    charset_arg_code_offset,
    charset_arg_map,
    charset_arg_subset,
    charset_arg_superset,
    charset_arg_unify_map,
    charset_arg_plist,
    charset_arg_max
  };
/* Indices to charset attributes vector.  */

enum charset_attr_index
  {
    /* ID number of the charset.  */
    charset_id,

    /* Name of the charset (symbol).  */
    charset_name,

    /* Property list of the charset.  */
    charset_plist,

    /* If the method of the charset is `MAP', the value is a mapping
       vector or a file name that contains mapping vector.  Otherwise,
       nil.  */
    charset_map,

    /* If the method of the charset is `MAP', the value is a vector
       that maps code points of the charset to characters.  The vector
       is indexed by a character index.  A character index is
       calculated from a code point and the code-space table of the
       charset.  */
    charset_decoder,

    /* If the method of the charset is `MAP', the value is a
       char-table that maps characters of the charset to code
       points.  */
    charset_encoder,

    /* If the method of the charset is `SUBSET', the value is a vector
       that has this form:

	[ CHARSET-ID MIN-CODE MAX-CODE OFFSET ]

       CHARSET-ID is an ID number of a parent charset.  MIN-CODE and
       MAX-CODE specify the range of characters inherited from the
       parent.  OFFSET is an integer value to add to a code point of
       the parent charset to get the corresponding code point of this
       charset.  */
    charset_subset,

    /* If the method of the charset is `SUPERSET', the value is a list
       whose elements have this form:

	(CHARSET-ID . OFFSET)

       CHARSET-IDs are ID numbers of parent charsets.  OFFSET is an
       integer value to add to a code point of the parent charset to
       get the corresponding code point of this charset.  */
    charset_superset,

    /* The value is a mapping vector or a file name that contains the
       mapping.  This defines how characters in the charset should be
       unified with Unicode.  The value of the member
       `charset_deunifier' is created from this information.  */
    charset_unify_map,

    /* If characters in the charset must be unified with Unicode, the
       value is a char table that maps a unified Unicode character code
       to the non-unified character code in the charset.  */
    charset_deunifier,

    /* The length of the charset attribute vector.  */
    charset_attr_max
  };
/* Methods for converting code points and characters of charsets.  */

enum charset_method
  {
    /* For a charset of this method, a character code is calculated
       from a character index (which is calculated from a code point)
       simply by adding an offset value.  */
    CHARSET_METHOD_OFFSET,

    /* For a charset of this method, a decoder vector and an encoder
       char-table are used for code point <-> character code
       conversion.  */
    CHARSET_METHOD_MAP,

    /* A charset of this method is a subset of another charset.  */
    CHARSET_METHOD_SUBSET,

    /* A charset of this method is a superset of other charsets.  */
    CHARSET_METHOD_SUPERSET
  };

struct charset
{
  /* Index to charset_table.  */
  int id;

  /* Index to Vcharset_hash_table.  */
  ptrdiff_t hash_index;

  /* Dimension of the charset: 1, 2, 3, or 4.  */
  int dimension;

  /* Byte code range of each dimension.  <code_space>[4N] is a minimum
     byte code of the (N+1)th dimension, <code_space>[4N+1] is a
     maximum byte code of the (N+1)th dimension, <code_space>[4N+2] is
     (<code_space>[4N+1] - <code_space>[4N] + 1), <code_space>[4N+3]
     is the number of characters contained in the first through (N+1)th
     dimensions, except that there is no <code_space>[15].
     We get `char-index' of a `code-point' from this
     information.  */
  int code_space[15];

  /* If B is a byte of Nth dimension of a code-point, the (N-1)th bit
     of code_space_mask[B] is set.  This array is used to quickly
     check if a code-point is in a valid range.  */
  unsigned char *code_space_mask;

  /* True if there's no gap in code-points.  */
  unsigned code_linear_p : 1;

  /* True if the charset is treated as 96 chars in ISO-2022
     as opposed to 94 chars.  */
  unsigned iso_chars_96 : 1;

  /* True if the charset is compatible with ASCII.  */
  unsigned ascii_compatible_p : 1;

  /* True if the charset is supplementary.  */
  unsigned supplementary_p : 1;

  /* True if all the code points are representable by Lisp_Int.  */
  unsigned compact_codes_p : 1;

  /* True if the charset is unified with Unicode.  */
  unsigned unified_p : 1;

  /* ISO final byte of the charset: 48..127.  It may be -1 if the
     charset doesn't conform to ISO-2022.  */
  int iso_final;

  /* ISO revision number of the charset.  */
  int iso_revision;

  /* If the charset is identical to one supported by Emacs 21 and
     earlier, the identification number of the charset used in those
     versions.  Otherwise, -1.  */
  int emacs_mule_id;

  /* The method for encoding/decoding characters of the charset.  */
  enum charset_method method;

  /* Minimum and Maximum code points of the charset.  */
  unsigned min_code, max_code;

  /* Offset value used by macros CODE_POINT_TO_INDEX and
     INDEX_TO_CODE_POINT.  */
  unsigned char_index_offset;

  /* Minimum and Maximum character codes of the charset.  If the
     charset is compatible with ASCII, min_char is a minimum non-ASCII
     character of the charset.  If the method of charset is
     CHARSET_METHOD_OFFSET, even if the charset is unified, min_char
     and max_char don't change.  */
  int min_char, max_char;

  /* The code returned by ENCODE_CHAR if a character is not encodable
     by the charset.  */
  unsigned invalid_code;

  /* If the method of the charset is CHARSET_METHOD_MAP, this is a
     table of bits used to quickly and roughly guess if a character
     belongs to the charset.

     The first 64 elements are 512 bits for characters less than
     0x10000.  Each bit corresponds to a 128-character block.  The last
     126 elements are 1008 bits for the greater characters
     (0x10000..0x3FFFFF).  Each bit corresponds to a 4096-character
     block.

     If a bit is 1, at least one character in the corresponding block is
     in this charset.  */
  unsigned char fast_map[190];

  /* Offset value to calculate a character code from code-point, and
     vice versa.  */
  int code_offset;
};
247 /* Hash table of charset symbols vs. the corresponding attribute
248 vectors. */
249 extern Lisp_Object Vcharset_hash_table;
251 /* Table of struct charset. */
252 extern struct charset *charset_table;
254 #define CHARSET_FROM_ID(id) (charset_table + (id))
256 extern Lisp_Object Vcharset_ordered_list;
257 extern Lisp_Object Vcharset_non_preferred_head;
259 /* Incremented every time we change the priority of charsets. */
260 extern unsigned short charset_ordered_list_tick;
262 extern Lisp_Object Viso_2022_charset_list;
263 extern Lisp_Object Vemacs_mule_charset_list;
265 extern int emacs_mule_charset[256];
/* Macros to access information about charset.  */

/* Return the attribute vector of charset whose symbol is SYMBOL.
   The lookup is Fgethash on Vcharset_hash_table with a default of
   Qnil.  */
#define CHARSET_SYMBOL_ATTRIBUTES(symbol) \
  Fgethash ((symbol), Vcharset_hash_table, Qnil)

/* Fetch one slot of the attribute vector ATTRS.  The slot indices are
   the enumerators of enum charset_attr_index.  */
#define CHARSET_ATTR_ID(attrs)        AREF ((attrs), charset_id)
#define CHARSET_ATTR_NAME(attrs)      AREF ((attrs), charset_name)
#define CHARSET_ATTR_PLIST(attrs)     AREF ((attrs), charset_plist)
#define CHARSET_ATTR_MAP(attrs)       AREF ((attrs), charset_map)
#define CHARSET_ATTR_DECODER(attrs)   AREF ((attrs), charset_decoder)
#define CHARSET_ATTR_ENCODER(attrs)   AREF ((attrs), charset_encoder)
#define CHARSET_ATTR_SUBSET(attrs)    AREF ((attrs), charset_subset)
#define CHARSET_ATTR_SUPERSET(attrs)  AREF ((attrs), charset_superset)
#define CHARSET_ATTR_UNIFY_MAP(attrs) AREF ((attrs), charset_unify_map)
#define CHARSET_ATTR_DEUNIFIER(attrs) AREF ((attrs), charset_deunifier)

/* Return the charset_id slot of the attribute vector of the charset
   whose symbol is SYMBOL.  */
#define CHARSET_SYMBOL_ID(symbol) \
  CHARSET_ATTR_ID (CHARSET_SYMBOL_ATTRIBUTES (symbol))

/* Return an index to Vcharset_hash_table of the charset whose symbol
   is SYMBOL.  */
#define CHARSET_SYMBOL_HASH_INDEX(symbol) \
  hash_lookup (XHASH_TABLE (Vcharset_hash_table), (symbol), NULL)

/* Return the attribute vector of CHARSET, which must be a pointer to
   struct charset; its hash_index member selects the hash table
   value.  */
#define CHARSET_ATTRIBUTES(charset) \
  (HASH_VALUE (XHASH_TABLE (Vcharset_hash_table), (charset)->hash_index))
/* Member accessors.  CHARSET is a pointer to struct charset; each
   macro expands to the named member and so can also be used as an
   lvalue.  */
#define CHARSET_ID(charset)                 ((charset)->id)
#define CHARSET_HASH_INDEX(charset)         ((charset)->hash_index)
#define CHARSET_DIMENSION(charset)          ((charset)->dimension)
#define CHARSET_CODE_SPACE(charset)         ((charset)->code_space)
#define CHARSET_CODE_LINEAR_P(charset)      ((charset)->code_linear_p)
#define CHARSET_ISO_CHARS_96(charset)       ((charset)->iso_chars_96)
#define CHARSET_ISO_FINAL(charset)          ((charset)->iso_final)
/* NOTE(review): struct charset as declared in this header has no
   `iso_plane' member, so any use of this macro on a struct charset
   fails to compile.  Kept only so the set of names this header
   defines is unchanged.  */
#define CHARSET_ISO_PLANE(charset)          ((charset)->iso_plane)
#define CHARSET_ISO_REVISION(charset)       ((charset)->iso_revision)
#define CHARSET_EMACS_MULE_ID(charset)      ((charset)->emacs_mule_id)
#define CHARSET_ASCII_COMPATIBLE_P(charset) ((charset)->ascii_compatible_p)
#define CHARSET_COMPACT_CODES_P(charset)    ((charset)->compact_codes_p)
#define CHARSET_METHOD(charset)             ((charset)->method)
#define CHARSET_MIN_CODE(charset)           ((charset)->min_code)
#define CHARSET_MAX_CODE(charset)           ((charset)->max_code)
#define CHARSET_INVALID_CODE(charset)       ((charset)->invalid_code)
#define CHARSET_MIN_CHAR(charset)           ((charset)->min_char)
#define CHARSET_MAX_CHAR(charset)           ((charset)->max_char)
#define CHARSET_CODE_OFFSET(charset)        ((charset)->code_offset)
#define CHARSET_UNIFIED_P(charset)          ((charset)->unified_p)

/* Attribute-vector accessors.  Each one looks up the attribute vector
   of CHARSET with CHARSET_ATTRIBUTES and then extracts one slot with
   the matching CHARSET_ATTR_* macro.  */
#define CHARSET_NAME(charset) \
  (CHARSET_ATTR_NAME (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_MAP(charset) \
  (CHARSET_ATTR_MAP (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_DECODER(charset) \
  (CHARSET_ATTR_DECODER (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_ENCODER(charset) \
  (CHARSET_ATTR_ENCODER (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_SUBSET(charset) \
  (CHARSET_ATTR_SUBSET (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_SUPERSET(charset) \
  (CHARSET_ATTR_SUPERSET (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_UNIFY_MAP(charset) \
  (CHARSET_ATTR_UNIFY_MAP (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_DEUNIFIER(charset) \
  (CHARSET_ATTR_DEUNIFIER (CHARSET_ATTRIBUTES (charset)))
334 CHARSET_INLINE void
335 set_charset_attr (struct charset *charset, enum charset_attr_index idx,
336 Lisp_Object val)
338 ASET (CHARSET_ATTRIBUTES (charset), idx, val);
/* Nonzero if OBJ is a valid charset symbol.  */
#define CHARSETP(obj) (CHARSET_SYMBOL_HASH_INDEX (obj) >= 0)

/* Check if X is a valid charset symbol.  If not, signal an error.
   X is evaluated more than once.  */
#define CHECK_CHARSET(x) \
  do { \
    if (! SYMBOLP (x) || CHARSET_SYMBOL_HASH_INDEX (x) < 0) \
      wrong_type_argument (Qcharsetp, (x)); \
  } while (0)


/* Check if X is a valid charset symbol.  If valid, set ID to the id
   number of the charset.  Otherwise, signal an error.
   X is evaluated more than once.  The temporary carries a long,
   suffixed name so that it cannot capture a caller variable used in X
   or ID (the short name `idx' could).  */
#define CHECK_CHARSET_GET_ID(x, id) \
  do { \
    ptrdiff_t charset_hash_idx_; \
    if (! SYMBOLP (x) \
        || (charset_hash_idx_ = CHARSET_SYMBOL_HASH_INDEX (x)) < 0) \
      wrong_type_argument (Qcharsetp, (x)); \
    (id) = XINT (AREF (HASH_VALUE (XHASH_TABLE (Vcharset_hash_table), \
                                   charset_hash_idx_), \
                       charset_id)); \
  } while (0)


/* Check if X is a valid charset symbol.  If valid, set ATTR to the
   attr vector of the charset.  Otherwise, signal an error.
   X is evaluated more than once.  */
#define CHECK_CHARSET_GET_ATTR(x, attr) \
  do { \
    if (!SYMBOLP (x) || NILP ((attr) = CHARSET_SYMBOL_ATTRIBUTES (x))) \
      wrong_type_argument (Qcharsetp, (x)); \
  } while (0)


/* Check if X is a valid charset symbol.  If valid, set CHARSET to the
   result of CHARSET_FROM_ID for the charset's id number.  Otherwise,
   signal an error.  The temporary is named so that it cannot capture
   a caller variable used in X or CHARSET.  */
#define CHECK_CHARSET_GET_CHARSET(x, charset) \
  do { \
    int charset_checked_id_; \
    CHECK_CHARSET_GET_ID (x, charset_checked_id_); \
    (charset) = CHARSET_FROM_ID (charset_checked_id_); \
  } while (0)
/* Look up Vcharset_ordered_list and return the first charset that
   contains the character C.  Characters below 0x80 are answered
   directly with the charset_ascii entry, without calling
   char_charset.  C is evaluated more than once.  */
#define CHAR_CHARSET(c) \
  ((c) < 0x80 ? CHARSET_FROM_ID (charset_ascii) \
   : char_charset ((c), Qnil, NULL))
389 #if 0
390 /* Char-table of charset-sets. Each element is a bool vector indexed
391 by a charset ID. */
392 extern Lisp_Object Vchar_charset_set;
394 /* Charset-bag of character C. */
395 #define CHAR_CHARSET_SET(c) \
396 CHAR_TABLE_REF (Vchar_charset_set, c)
398 /* Check if two characters C1 and C2 belong to the same charset. */
399 #define SAME_CHARSET_P(c1, c2) \
400 intersection_p (CHAR_CHARSET_SET (c1), CHAR_CHARSET_SET (c2))
402 #endif
/* Return a character corresponding to the code-point CODE of CHARSET.
   Try some optimization before calling decode_char:

   - an ASCII byte in an ASCII-compatible charset is returned as is;
   - a code outside min_code..max_code yields -1;
   - a non-unified, code-linear CHARSET_METHOD_OFFSET charset is
     decoded arithmetically from min_code and code_offset;
   - a non-unified, code-linear CHARSET_METHOD_MAP charset whose
     decoder is a vector is decoded by indexing that vector;
   - everything else goes through decode_char.

   CHARSET and CODE are evaluated more than once.  */

#define DECODE_CHAR(charset, code) \
  ((ASCII_BYTE_P (code) && (charset)->ascii_compatible_p) \
   ? (code) \
   : ((code) < (charset)->min_code || (code) > (charset)->max_code) \
   ? -1 \
   : (charset)->unified_p \
   ? decode_char ((charset), (code)) \
   : (charset)->method == CHARSET_METHOD_OFFSET \
   ? ((charset)->code_linear_p \
      ? (int) ((code) - (charset)->min_code) + (charset)->code_offset \
      : decode_char ((charset), (code))) \
   : (charset)->method == CHARSET_METHOD_MAP \
   ? (((charset)->code_linear_p \
       && VECTORP (CHARSET_DECODER (charset))) \
      ? XINT (AREF (CHARSET_DECODER (charset), \
                    (code) - (charset)->min_code)) \
      : decode_char ((charset), (code))) \
   : decode_char ((charset), (code)))
427 extern Lisp_Object charset_work;
/* Return a code point of C in CHARSET.
   Try some optimization before calling encode_char:

   - an ASCII character in an ASCII-compatible charset is returned as
     is;
   - unified, CHARSET_METHOD_SUBSET and CHARSET_METHOD_SUPERSET
     charsets always go through encode_char;
   - a character outside min_char..max_char yields invalid_code;
   - a code-linear CHARSET_METHOD_OFFSET charset is encoded
     arithmetically from code_offset and min_code;
   - a CHARSET_METHOD_MAP charset with compact codes whose encoder is
     a char-table is encoded by a char-table lookup, using the global
     charset_work as scratch storage;
   - everything else goes through encode_char.

   verify_expr rejects, at compile time, a C wider than int.
   CHARSET and C are evaluated more than once.  */

#define ENCODE_CHAR(charset, c) \
  (verify_expr \
   (sizeof (c) <= sizeof (int), \
    (ASCII_CHAR_P (c) && (charset)->ascii_compatible_p \
     ? (unsigned) (c) \
     : ((charset)->unified_p \
        || (charset)->method == CHARSET_METHOD_SUBSET \
        || (charset)->method == CHARSET_METHOD_SUPERSET) \
     ? encode_char ((charset), (c)) \
     : (c) < (charset)->min_char || (c) > (charset)->max_char \
     ? (charset)->invalid_code \
     : (charset)->method == CHARSET_METHOD_OFFSET \
     ? ((charset)->code_linear_p \
        ? (unsigned) ((c) - (charset)->code_offset) + (charset)->min_code \
        : encode_char ((charset), (c))) \
     : (charset)->method == CHARSET_METHOD_MAP \
     ? (((charset)->compact_codes_p \
         && CHAR_TABLE_P (CHARSET_ENCODER (charset))) \
        ? (charset_work = CHAR_TABLE_REF (CHARSET_ENCODER (charset), (c)), \
           (NILP (charset_work) \
            ? (charset)->invalid_code \
            : (unsigned) XFASTINT (charset_work))) \
        : encode_char ((charset), (c))) \
     : encode_char ((charset), (c)))))
458 /* Set to 1 when a charset map is loaded to warn that a buffer text
459 and a string data may be relocated. */
460 extern bool charset_map_loaded;
/* Set CHARSET to the highest-priority charset of C, and CODE to the
   code-point of C in CHARSET.  Both come from a single call to
   char_charset with Qnil as its second argument; CODE must be an
   lvalue of type unsigned, because its address is passed.  */
#define SPLIT_CHAR(c, charset, code) \
  ((charset) = char_charset ((c), Qnil, &(code)))
/* Extents of iso_charset_table.  */
#define ISO_MAX_DIMENSION 3
#define ISO_MAX_CHARS 2
#define ISO_MAX_FINAL 0x80	/* only 0x30..0x7F are used */

/* Mapping table from ISO2022's charset (specified by DIMENSION,
   CHARS, and FINAL_CHAR) to Emacs' charset ID.  Should be accessed by
   macro ISO_CHARSET_TABLE (DIMENSION, CHARS, FINAL_CHAR).  */
extern int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];

/* The table entry for a charset of type iso2022 that has DIMENSION,
   CHARS_96, and FINAL (final character).  DIMENSION is 1-based; the
   macro subtracts 1 to form the first index.  The expansion is an
   lvalue.  */
#define ISO_CHARSET_TABLE(dimension, chars_96, final) \
  iso_charset_table[(dimension) - 1][(chars_96)][(final)]
/* Nonzero if the charset that has FAST_MAP may contain C.
   For C below 0x10000 the bit tested is bit ((C >> 7) & 7) of byte
   (C >> 10), i.e. one bit per 128 characters.  For larger C it is bit
   ((C >> 12) & 7) of byte ((C >> 15) + 62), i.e. one bit per 4096
   characters.  C is evaluated more than once.  FAST_MAP may be any
   expression yielding a pointer to unsigned char.  */
#define CHARSET_FAST_MAP_REF(c, fast_map) \
  ((c) < 0x10000 \
   ? (fast_map)[(c) >> 10] & (1 << (((c) >> 7) & 7)) \
   : (fast_map)[((c) >> 15) + 62] & (1 << (((c) >> 12) & 7)))

/* Set the bit of FAST_MAP that corresponds to the block containing C,
   using the same byte/bit layout as CHARSET_FAST_MAP_REF.  C is
   evaluated more than once.  */
#define CHARSET_FAST_MAP_SET(c, fast_map) \
  do { \
    if ((c) < 0x10000) \
      (fast_map)[(c) >> 10] |= 1 << (((c) >> 7) & 7); \
    else \
      (fast_map)[((c) >> 15) + 62] |= 1 << (((c) >> 12) & 7); \
  } while (0)
/* True if CHARSET may contain the character C.

   - An ASCII character in an ASCII-compatible charset: true.
   - Unified, CHARSET_METHOD_SUBSET and CHARSET_METHOD_SUPERSET
     charsets: decided by whether encode_char returns something other
     than invalid_code.
   - Otherwise the fast_map bit for C must be set, and then:
     CHARSET_METHOD_OFFSET charsets test min_char <= C <= max_char;
     CHARSET_METHOD_MAP charsets with compact codes and a char-table
     encoder test for a non-nil encoder entry; anything else asks
     encode_char.

   C and CHARSET are evaluated more than once.  */
#define CHAR_CHARSET_P(c, charset) \
  ((ASCII_CHAR_P (c) && (charset)->ascii_compatible_p) \
   || ((CHARSET_UNIFIED_P (charset) \
        || (charset)->method == CHARSET_METHOD_SUBSET \
        || (charset)->method == CHARSET_METHOD_SUPERSET) \
       ? encode_char ((charset), (c)) != (charset)->invalid_code \
       : (CHARSET_FAST_MAP_REF ((c), (charset)->fast_map) \
          && ((charset)->method == CHARSET_METHOD_OFFSET \
              ? (c) >= (charset)->min_char && (c) <= (charset)->max_char \
              : ((charset)->method == CHARSET_METHOD_MAP \
                 && (charset)->compact_codes_p \
                 && CHAR_TABLE_P (CHARSET_ENCODER (charset))) \
              ? ! NILP (CHAR_TABLE_REF (CHARSET_ENCODER (charset), (c))) \
              : encode_char ((charset), (c)) != (charset)->invalid_code))))
/* Special macros for emacs-mule encoding.  */

/* Leading-code followed by extended leading-code.  DIMENSION/COLUMN */
#define EMACS_MULE_LEADING_CODE_PRIVATE_11	0x9A /* 1/1 */
#define EMACS_MULE_LEADING_CODE_PRIVATE_12	0x9B /* 1/2 */
#define EMACS_MULE_LEADING_CODE_PRIVATE_21	0x9C /* 2/1 */
#define EMACS_MULE_LEADING_CODE_PRIVATE_22	0x9D /* 2/2 */
526 extern Lisp_Object Qcharsetp;
528 extern Lisp_Object Qascii;
529 extern int charset_ascii, charset_eight_bit;
530 extern int charset_unicode;
531 extern int charset_jisx0201_roman;
532 extern int charset_jisx0208_1978;
533 extern int charset_jisx0208;
534 extern int charset_ksc5601;
536 extern int charset_unibyte;
538 extern struct charset *char_charset (int, Lisp_Object, unsigned *);
539 extern Lisp_Object charset_attributes (int);
541 extern int decode_char (struct charset *, unsigned);
542 extern unsigned encode_char (struct charset *, int);
543 extern int string_xstring_p (Lisp_Object);
545 extern void map_charset_chars (void (*) (Lisp_Object, Lisp_Object),
546 Lisp_Object, Lisp_Object,
547 struct charset *, unsigned, unsigned);
549 INLINE_HEADER_END
551 #endif /* EMACS_CHARSET_H */