1 /* Basic multilingual character support.
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1997, 1998, 1999, 2000, 2001
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
8 This file is part of GNU Emacs.
10 GNU Emacs is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
15 GNU Emacs is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with GNU Emacs; see the file COPYING. If not, write to
22 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 Boston, MA 02110-1301, USA. */
25 /* At first, see the document in `charset.h' to understand the code in
36 #include <sys/types.h>
40 #include "composite.h"
50 Lisp_Object Qcharset
, Qascii
, Qeight_bit_control
, Qeight_bit_graphic
;
53 /* Declaration of special leading-codes. */
54 EMACS_INT leading_code_private_11
; /* for private DIMENSION1 of 1-column */
55 EMACS_INT leading_code_private_12
; /* for private DIMENSION1 of 2-column */
56 EMACS_INT leading_code_private_21
; /* for private DIMENSION2 of 1-column */
57 EMACS_INT leading_code_private_22
; /* for private DIMENSION2 of 2-column */
59 /* Declaration of special charsets. The values are set by
60 Fsetup_special_charsets. */
61 int charset_latin_iso8859_1
; /* ISO8859-1 (Latin-1) */
62 int charset_jisx0208_1978
; /* JISX0208.1978 (Japanese Kanji old set) */
63 int charset_jisx0208
; /* JISX0208.1983 (Japanese Kanji) */
64 int charset_katakana_jisx0201
; /* JISX0201.Kana (Japanese Katakana) */
65 int charset_latin_jisx0201
; /* JISX0201.Roman (Japanese Roman) */
66 int charset_big5_1
; /* Big5 Level 1 (Chinese Traditional) */
67 int charset_big5_2
; /* Big5 Level 2 (Chinese Traditional) */
68 int charset_mule_unicode_0100_24ff
;
69 int charset_mule_unicode_2500_33ff
;
70 int charset_mule_unicode_e000_ffff
;
72 Lisp_Object Qcharset_table
;
74 /* A char-table containing information of each character set. */
75 Lisp_Object Vcharset_table
;
77 /* A vector of charset symbol indexed by charset-id. This is used
78 only for returning charset symbol from C functions. */
79 Lisp_Object Vcharset_symbol_table
;
81 /* A list of charset symbols ever defined. */
82 Lisp_Object Vcharset_list
;
84 /* Vector of translation table ever defined.
85 ID of a translation table is used to index this vector. */
86 Lisp_Object Vtranslation_table_vector
;
88 /* A char-table for characters which may invoke auto-filling. */
89 Lisp_Object Vauto_fill_chars
;
91 Lisp_Object Qauto_fill_chars
;
93 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
94 int bytes_by_char_head
[256];
95 int width_by_char_head
[256];
97 /* Mapping table from ISO2022's charset (specified by DIMENSION,
98 CHARS, and FINAL-CHAR) to Emacs' charset. */
99 int iso_charset_table
[2][2][128];
101 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
102 unsigned char *_fetch_multibyte_char_p
;
103 int _fetch_multibyte_char_len
;
105 /* Offset to add to a non-ASCII value when inserting it. */
106 EMACS_INT nonascii_insert_offset
;
108 /* Translation table for converting non-ASCII unibyte characters
109 to multibyte codes, or nil. */
110 Lisp_Object Vnonascii_translation_table
;
112 /* List of all possible generic characters. */
113 Lisp_Object Vgeneric_character_list
;
117 invalid_character (c
)
120 error ("Invalid character: %d, #o%o, #x%x", c
, c
, c
);
123 /* Parse string STR of length LENGTH and fetch information of a
124 character at STR. Set BYTES to the byte length the character
125 occupies, CHARSET, C1, C2 to proper values of the character. */
127 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
130 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
132 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
133 else if ((bytes) == 2) \
135 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
136 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
138 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
140 else if ((bytes) == 3) \
142 if ((c1) < LEADING_CODE_PRIVATE_11) \
143 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
145 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
148 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
151 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.
152 Note that this intentionally allows invalid components, such
153 as 0xA0 0xA0, because there exist many files that contain
154 such invalid byte sequences, especially in EUC-GB. */
155 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
156 ((charset) == CHARSET_ASCII \
157 ? ((c1) >= 0 && (c1) <= 0x7F) \
158 : ((charset) == CHARSET_8_BIT_CONTROL \
159 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
160 : ((charset) == CHARSET_8_BIT_GRAPHIC \
161 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
162 : (CHARSET_DIMENSION (charset) == 1 \
163 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
164 : ((c1) >= 0x20 && (c1) <= 0x7F \
165 && (c2) >= 0x20 && (c2) <= 0x7F)))))
167 /* Store multi-byte form of the character C in STR. The caller should
168 allocate at least 4-byte area at STR in advance. Returns the
169 length of the multi-byte form. If C is an invalid character code,
173 char_to_string_1 (c
, str
)
177 unsigned char *p
= str
;
179 if (c
& CHAR_MODIFIER_MASK
) /* This includes the case C is negative. */
181 /* Multibyte character can't have a modifier bit. */
182 if (! SINGLE_BYTE_CHAR_P ((c
& ~CHAR_MODIFIER_MASK
)))
185 /* For Meta, Shift, and Control modifiers, we need special care. */
188 /* Move the meta bit to the right place for a string. */
189 c
= (c
& ~CHAR_META
) | 0x80;
193 /* Shift modifier is valid only with [A-Za-z]. */
194 if ((c
& 0377) >= 'A' && (c
& 0377) <= 'Z')
196 else if ((c
& 0377) >= 'a' && (c
& 0377) <= 'z')
197 c
= (c
& ~CHAR_SHIFT
) - ('a' - 'A');
201 /* Simulate the code in lread.c. */
202 /* Allow `\C- ' and `\C-?'. */
203 if (c
== (CHAR_CTL
| ' '))
205 else if (c
== (CHAR_CTL
| '?'))
207 /* ASCII control chars are made from letters (both cases),
208 as well as the non-letters within 0100...0137. */
209 else if ((c
& 0137) >= 0101 && (c
& 0137) <= 0132)
210 c
&= (037 | (~0177 & ~CHAR_CTL
));
211 else if ((c
& 0177) >= 0100 && (c
& 0177) <= 0137)
212 c
&= (037 | (~0177 & ~CHAR_CTL
));
215 /* If C still has any modifier bits, just ignore it. */
216 c
&= ~CHAR_MODIFIER_MASK
;
219 if (SINGLE_BYTE_CHAR_P (c
))
221 if (ASCII_BYTE_P (c
) || c
>= 0xA0)
225 *p
++ = LEADING_CODE_8_BIT_CONTROL
;
229 else if (CHAR_VALID_P (c
, 0))
233 SPLIT_CHAR (c
, charset
, c1
, c2
);
235 if (charset
>= LEADING_CODE_EXT_11
)
236 *p
++ = (charset
< LEADING_CODE_EXT_12
237 ? LEADING_CODE_PRIVATE_11
238 : (charset
< LEADING_CODE_EXT_21
239 ? LEADING_CODE_PRIVATE_12
240 : (charset
< LEADING_CODE_EXT_22
241 ? LEADING_CODE_PRIVATE_21
242 : LEADING_CODE_PRIVATE_22
)));
244 if ((c1
> 0 && c1
< 32) || (c2
> 0 && c2
< 32))
260 /* Store multi-byte form of the character C in STR. The caller should
261 allocate at least 4-byte area at STR in advance. Returns the
262 length of the multi-byte form. If C is an invalid character code,
265 Use macro `CHAR_STRING (C, STR)' instead of calling this function
266 directly if C can be an ASCII character. */
269 char_to_string (c
, str
)
274 len
= char_to_string_1 (c
, str
);
276 invalid_character (c
);
281 /* Return the non-ASCII character corresponding to multi-byte form at
282 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
283 length of the multibyte form in *ACTUAL_LEN.
285 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
286 this function directly if you want ot handle ASCII characters as
290 string_to_char (str
, len
, actual_len
)
291 const unsigned char *str
;
292 int len
, *actual_len
;
294 int c
, bytes
, charset
, c1
, c2
;
296 SPLIT_MULTIBYTE_SEQ (str
, len
, bytes
, charset
, c1
, c2
);
297 c
= MAKE_CHAR (charset
, c1
, c2
);
303 /* Return the length of the multi-byte form at string STR of length LEN.
304 Use the macro MULTIBYTE_FORM_LENGTH instead. */
306 multibyte_form_length (str
, len
)
307 const unsigned char *str
;
312 PARSE_MULTIBYTE_SEQ (str
, len
, bytes
);
316 /* Check multibyte form at string STR of length LEN and set variables
317 pointed by CHARSET, C1, and C2 to charset and position codes of the
318 character at STR, and return 0. If there's no multibyte character,
319 return -1. This should be used only in the macro SPLIT_STRING
320 which checks range of STR in advance. */
323 split_string (str
, len
, charset
, c1
, c2
)
324 const unsigned char *str
;
325 unsigned char *c1
, *c2
;
328 register int bytes
, cs
, code1
, code2
= -1;
330 SPLIT_MULTIBYTE_SEQ (str
, len
, bytes
, cs
, code1
, code2
);
331 if (cs
== CHARSET_ASCII
)
339 /* Return 1 iff character C has valid printable glyph.
340 Use the macro CHAR_PRINTABLE_P instead. */
347 if (ASCII_BYTE_P (c
))
349 else if (SINGLE_BYTE_CHAR_P (c
))
351 else if (c
>= MAX_CHAR
)
354 SPLIT_CHAR (c
, charset
, c1
, c2
);
355 if (! CHARSET_DEFINED_P (charset
))
357 if (CHARSET_CHARS (charset
) == 94
358 ? c1
<= 32 || c1
>= 127
361 if (CHARSET_DIMENSION (charset
) == 2
362 && (CHARSET_CHARS (charset
) == 94
363 ? c2
<= 32 || c2
>= 127
369 /* Translate character C by translation table TABLE. If C
370 is negative, translate a character specified by CHARSET, C1, and C2
371 (C1 and C2 are code points of the character). If no translation is
372 found in TABLE, return C. */
374 translate_char (table
, c
, charset
, c1
, c2
)
376 int c
, charset
, c1
, c2
;
379 int alt_charset
, alt_c1
, alt_c2
, dimension
;
381 if (c
< 0) c
= MAKE_CHAR (charset
, (c1
& 0x7F) , (c2
& 0x7F));
382 if (!CHAR_TABLE_P (table
)
383 || (ch
= Faref (table
, make_number (c
)), !NATNUMP (ch
)))
386 SPLIT_CHAR (XFASTINT (ch
), alt_charset
, alt_c1
, alt_c2
);
387 dimension
= CHARSET_DIMENSION (alt_charset
);
388 if ((dimension
== 1 && alt_c1
> 0) || (dimension
== 2 && alt_c2
> 0))
389 /* CH is not a generic character, just return it. */
390 return XFASTINT (ch
);
392 /* Since CH is a generic character, we must return a specific
393 charater which has the same position codes as C from CH. */
395 SPLIT_CHAR (c
, charset
, c1
, c2
);
396 if (dimension
!= CHARSET_DIMENSION (charset
))
397 /* We can't make such a character because of dimension mismatch. */
399 return MAKE_CHAR (alt_charset
, c1
, c2
);
402 /* Convert the unibyte character C to multibyte based on
403 Vnonascii_translation_table or nonascii_insert_offset. If they can't
404 convert C to a valid multibyte character, convert it based on
405 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
408 unibyte_char_to_multibyte (c
)
411 if (c
< 0400 && c
>= 0200)
415 if (! NILP (Vnonascii_translation_table
))
417 c
= XINT (Faref (Vnonascii_translation_table
, make_number (c
)));
418 if (c
>= 0400 && ! char_valid_p (c
, 0))
419 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
421 else if (c
>= 0240 && nonascii_insert_offset
> 0)
423 c
+= nonascii_insert_offset
;
424 if (c
< 0400 || ! char_valid_p (c
, 0))
425 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
428 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
434 /* Convert the multibyte character C to unibyte 8-bit character based
435 on Vnonascii_translation_table or nonascii_insert_offset. If
436 REV_TBL is non-nil, it should be a reverse table of
437 Vnonascii_translation_table, i.e. what given by:
438 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
441 multibyte_char_to_unibyte (c
, rev_tbl
)
445 if (!SINGLE_BYTE_CHAR_P (c
))
449 if (! CHAR_TABLE_P (rev_tbl
)
450 && CHAR_TABLE_P (Vnonascii_translation_table
))
451 rev_tbl
= Fchar_table_extra_slot (Vnonascii_translation_table
,
453 if (CHAR_TABLE_P (rev_tbl
))
456 temp
= Faref (rev_tbl
, make_number (c
));
460 c
= (c_save
& 0177) + 0200;
464 if (nonascii_insert_offset
> 0)
465 c
-= nonascii_insert_offset
;
466 if (c
< 128 || c
>= 256)
467 c
= (c_save
& 0177) + 0200;
475 /* Update the table Vcharset_table with the given arguments (see the
476 document of `define-charset' for the meaning of each argument).
477 Several other table contents are also updated. The caller should
478 check the validity of CHARSET-ID and the remaining arguments in
482 update_charset_table (charset_id
, dimension
, chars
, width
, direction
,
483 iso_final_char
, iso_graphic_plane
,
484 short_name
, long_name
, description
)
485 Lisp_Object charset_id
, dimension
, chars
, width
, direction
;
486 Lisp_Object iso_final_char
, iso_graphic_plane
;
487 Lisp_Object short_name
, long_name
, description
;
489 int charset
= XINT (charset_id
);
491 unsigned char leading_code_base
, leading_code_ext
;
493 if (NILP (CHARSET_TABLE_ENTRY (charset
)))
494 CHARSET_TABLE_ENTRY (charset
)
495 = Fmake_vector (make_number (CHARSET_MAX_IDX
), Qnil
);
497 if (NILP (long_name
))
498 long_name
= short_name
;
499 if (NILP (description
))
500 description
= long_name
;
502 /* Get byte length of multibyte form, base leading-code, and
503 extended leading-code of the charset. See the comment under the
504 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
505 bytes
= XINT (dimension
);
506 if (charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
508 /* Official charset, it doesn't have an extended leading-code. */
509 if (charset
!= CHARSET_ASCII
&& charset
!= CHARSET_8_BIT_GRAPHIC
)
510 bytes
+= 1; /* For a base leading-code. */
511 leading_code_base
= charset
;
512 leading_code_ext
= 0;
516 /* Private charset. */
517 bytes
+= 2; /* For base and extended leading-codes. */
519 = (charset
< LEADING_CODE_EXT_12
520 ? LEADING_CODE_PRIVATE_11
521 : (charset
< LEADING_CODE_EXT_21
522 ? LEADING_CODE_PRIVATE_12
523 : (charset
< LEADING_CODE_EXT_22
524 ? LEADING_CODE_PRIVATE_21
525 : LEADING_CODE_PRIVATE_22
)));
526 leading_code_ext
= charset
;
527 if (BYTES_BY_CHAR_HEAD (leading_code_base
) != bytes
)
528 error ("Invalid dimension for the charset-ID %d", charset
);
531 CHARSET_TABLE_INFO (charset
, CHARSET_ID_IDX
) = charset_id
;
532 CHARSET_TABLE_INFO (charset
, CHARSET_BYTES_IDX
) = make_number (bytes
);
533 CHARSET_TABLE_INFO (charset
, CHARSET_DIMENSION_IDX
) = dimension
;
534 CHARSET_TABLE_INFO (charset
, CHARSET_CHARS_IDX
) = chars
;
535 CHARSET_TABLE_INFO (charset
, CHARSET_WIDTH_IDX
) = width
;
536 CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
) = direction
;
537 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_BASE_IDX
)
538 = make_number (leading_code_base
);
539 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_EXT_IDX
)
540 = make_number (leading_code_ext
);
541 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_FINAL_CHAR_IDX
) = iso_final_char
;
542 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_GRAPHIC_PLANE_IDX
)
544 CHARSET_TABLE_INFO (charset
, CHARSET_SHORT_NAME_IDX
) = short_name
;
545 CHARSET_TABLE_INFO (charset
, CHARSET_LONG_NAME_IDX
) = long_name
;
546 CHARSET_TABLE_INFO (charset
, CHARSET_DESCRIPTION_IDX
) = description
;
547 CHARSET_TABLE_INFO (charset
, CHARSET_PLIST_IDX
) = Qnil
;
550 /* If we have already defined a charset which has the same
551 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
552 DIRECTION, we must update the entry REVERSE-CHARSET of both
553 charsets. If there's no such charset, the value of the entry
557 for (i
= 0; i
<= MAX_CHARSET
; i
++)
558 if (!NILP (CHARSET_TABLE_ENTRY (i
)))
560 if (CHARSET_DIMENSION (i
) == XINT (dimension
)
561 && CHARSET_CHARS (i
) == XINT (chars
)
562 && CHARSET_ISO_FINAL_CHAR (i
) == XINT (iso_final_char
)
563 && CHARSET_DIRECTION (i
) != XINT (direction
))
565 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
567 CHARSET_TABLE_INFO (i
, CHARSET_REVERSE_CHARSET_IDX
) = charset_id
;
572 /* No such a charset. */
573 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
577 if (charset
!= CHARSET_ASCII
&& charset
!= CHARSET_8_BIT_GRAPHIC
578 && charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
580 bytes_by_char_head
[leading_code_base
] = bytes
;
581 width_by_char_head
[leading_code_base
] = XINT (width
);
583 /* Update table emacs_code_class. */
584 emacs_code_class
[charset
] = (bytes
== 2
585 ? EMACS_leading_code_2
587 ? EMACS_leading_code_3
588 : EMACS_leading_code_4
));
591 /* Update table iso_charset_table. */
592 if (XINT (iso_final_char
) >= 0
593 && ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) < 0)
594 ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) = charset
;
599 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
602 get_charset_id (charset_symbol
)
603 Lisp_Object charset_symbol
;
608 /* This originally used a ?: operator, but reportedly the HP-UX
609 compiler version HP92453-01 A.10.32.22 miscompiles that. */
610 if (SYMBOLP (charset_symbol
)
611 && VECTORP (val
= Fget (charset_symbol
, Qcharset
))
612 && CHARSET_VALID_P (charset
=
613 XINT (XVECTOR (val
)->contents
[CHARSET_ID_IDX
])))
619 /* Return an identification number for a new private charset of
620 DIMENSION and WIDTH. If there's no more room for the new charset,
623 get_new_private_charset_id (dimension
, width
)
624 int dimension
, width
;
626 int charset
, from
, to
;
630 from
= LEADING_CODE_EXT_11
;
631 to
= LEADING_CODE_EXT_21
;
635 from
= LEADING_CODE_EXT_21
;
636 to
= LEADING_CODE_EXT_MAX
+ 1;
639 for (charset
= from
; charset
< to
; charset
++)
640 if (!CHARSET_DEFINED_P (charset
)) break;
642 return make_number (charset
< to
? charset
: 0);
645 DEFUN ("define-charset", Fdefine_charset
, Sdefine_charset
, 3, 3, 0,
646 doc
: /* Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.
647 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is
648 treated as a private charset.
649 INFO-VECTOR is a vector of the format:
650 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE
651 SHORT-NAME LONG-NAME DESCRIPTION]
652 The meanings of each elements is as follows:
653 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.
654 CHARS (integer) is the number of characters in a dimension: 94 or 96.
655 WIDTH (integer) is the number of columns a character in the charset
656 occupies on the screen: one of 0, 1, and 2.
658 DIRECTION (integer) is the rendering direction of characters in the
659 charset when rendering. If 0, render from left to right, else
660 render from right to left.
662 ISO-FINAL-CHAR (character) is the final character of the
663 corresponding ISO 2022 charset.
664 It may be -1 if the charset is internal use only.
666 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked
667 while encoding to variants of ISO 2022 coding system, one of the
668 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).
669 It may be -1 if the charset is internal use only.
671 SHORT-NAME (string) is the short name to refer to the charset.
673 LONG-NAME (string) is the long name to refer to the charset.
675 DESCRIPTION (string) is the description string of the charset. */)
676 (charset_id
, charset_symbol
, info_vector
)
677 Lisp_Object charset_id
, charset_symbol
, info_vector
;
681 if (!NILP (charset_id
))
682 CHECK_NUMBER (charset_id
);
683 CHECK_SYMBOL (charset_symbol
);
684 CHECK_VECTOR (info_vector
);
686 if (! NILP (charset_id
))
688 if (! CHARSET_VALID_P (XINT (charset_id
)))
689 error ("Invalid CHARSET: %d", XINT (charset_id
));
690 else if (CHARSET_DEFINED_P (XINT (charset_id
)))
691 error ("Already defined charset: %d", XINT (charset_id
));
694 vec
= XVECTOR (info_vector
)->contents
;
695 if (XVECTOR (info_vector
)->size
!= 9
696 || !INTEGERP (vec
[0]) || !(XINT (vec
[0]) == 1 || XINT (vec
[0]) == 2)
697 || !INTEGERP (vec
[1]) || !(XINT (vec
[1]) == 94 || XINT (vec
[1]) == 96)
698 || !INTEGERP (vec
[2]) || !(XINT (vec
[2]) == 1 || XINT (vec
[2]) == 2)
699 || !INTEGERP (vec
[3]) || !(XINT (vec
[3]) == 0 || XINT (vec
[3]) == 1)
700 || !INTEGERP (vec
[4])
701 || !(XINT (vec
[4]) == -1 || (XINT (vec
[4]) >= '0' && XINT (vec
[4]) <= '~'))
702 || !INTEGERP (vec
[5])
703 || !(XINT (vec
[5]) == -1 || XINT (vec
[5]) == 0 || XINT (vec
[5]) == 1)
706 || !STRINGP (vec
[8]))
707 error ("Invalid info-vector argument for defining charset %s",
708 SDATA (SYMBOL_NAME (charset_symbol
)));
710 if (NILP (charset_id
))
712 charset_id
= get_new_private_charset_id (XINT (vec
[0]), XINT (vec
[2]));
713 if (XINT (charset_id
) == 0)
714 error ("There's no room for a new private charset %s",
715 SDATA (SYMBOL_NAME (charset_symbol
)));
718 update_charset_table (charset_id
, vec
[0], vec
[1], vec
[2], vec
[3],
719 vec
[4], vec
[5], vec
[6], vec
[7], vec
[8]);
720 Fput (charset_symbol
, Qcharset
, CHARSET_TABLE_ENTRY (XINT (charset_id
)));
721 CHARSET_SYMBOL (XINT (charset_id
)) = charset_symbol
;
722 Vcharset_list
= Fcons (charset_symbol
, Vcharset_list
);
723 Fupdate_coding_systems_internal ();
727 DEFUN ("generic-character-list", Fgeneric_character_list
,
728 Sgeneric_character_list
, 0, 0, 0,
729 doc
: /* Return a list of all possible generic characters.
730 It includes a generic character for a charset not yet defined. */)
733 return Vgeneric_character_list
;
736 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char
,
737 Sget_unused_iso_final_char
, 2, 2, 0,
738 doc
: /* Return an unused ISO's final char for a charset of DIMENSION and CHARS.
739 DIMENSION is the number of bytes to represent a character: 1 or 2.
740 CHARS is the number of characters in a dimension: 94 or 96.
742 This final char is for private use, thus the range is `0' (48) .. `?' (63).
743 If there's no unused final char for the specified kind of charset,
746 Lisp_Object dimension
, chars
;
750 CHECK_NUMBER (dimension
);
751 CHECK_NUMBER (chars
);
752 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
753 error ("Invalid charset dimension %d, it should be 1 or 2",
755 if (XINT (chars
) != 94 && XINT (chars
) != 96)
756 error ("Invalid charset chars %d, it should be 94 or 96",
758 for (final_char
= '0'; final_char
<= '?'; final_char
++)
760 if (ISO_CHARSET_TABLE (dimension
, chars
, make_number (final_char
)) < 0)
763 return (final_char
<= '?' ? make_number (final_char
) : Qnil
);
766 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset
, Sdeclare_equiv_charset
,
768 doc
: /* Declare an equivalent charset for ISO-2022 decoding.
770 On decoding by an ISO-2022 base coding system, when a charset
771 specified by DIMENSION, CHARS, and FINAL-CHAR is designated, behave as
772 if CHARSET is designated instead. */)
773 (dimension
, chars
, final_char
, charset
)
774 Lisp_Object dimension
, chars
, final_char
, charset
;
778 CHECK_NUMBER (dimension
);
779 CHECK_NUMBER (chars
);
780 CHECK_NUMBER (final_char
);
781 CHECK_SYMBOL (charset
);
783 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
784 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension
));
785 if (XINT (chars
) != 94 && XINT (chars
) != 96)
786 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars
));
787 if (XINT (final_char
) < '0' || XFASTINT (final_char
) > '~')
788 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars
));
789 if ((charset_id
= get_charset_id (charset
)) < 0)
790 error ("Invalid charset %s", SDATA (SYMBOL_NAME (charset
)));
792 ISO_CHARSET_TABLE (dimension
, chars
, final_char
) = charset_id
;
796 /* Return information about charsets in the text at PTR of NBYTES
797 bytes, which are NCHARS characters. The value is:
799 0: Each character is represented by one byte. This is always
800 true for unibyte text.
801 1: No charsets other than ascii eight-bit-control,
802 eight-bit-graphic, and latin-1 are found.
805 In addition, if CHARSETS is nonzero, for each found charset N, set
806 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
807 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
808 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
809 1 (note that there's no charset whose ID is 1). */
812 find_charset_in_text (ptr
, nchars
, nbytes
, charsets
, table
)
813 const unsigned char *ptr
;
814 int nchars
, nbytes
, *charsets
;
817 if (nchars
== nbytes
)
819 if (charsets
&& nbytes
> 0)
821 const unsigned char *endp
= ptr
+ nbytes
;
824 while (ptr
< endp
&& maskbits
!= 7)
826 maskbits
|= (*ptr
< 0x80 ? 1 : *ptr
< 0xA0 ? 2 : 4);
831 charsets
[CHARSET_ASCII
] = 1;
833 charsets
[CHARSET_8_BIT_CONTROL
] = 1;
835 charsets
[CHARSET_8_BIT_GRAPHIC
] = 1;
842 int bytes
, charset
, c1
, c2
;
844 if (! CHAR_TABLE_P (table
))
849 SPLIT_MULTIBYTE_SEQ (ptr
, len
, bytes
, charset
, c1
, c2
);
852 if (!CHARSET_DEFINED_P (charset
))
854 else if (! NILP (table
))
856 int c
= translate_char (table
, -1, charset
, c1
, c2
);
858 charset
= CHAR_CHARSET (c
);
862 && charset
!= CHARSET_ASCII
863 && charset
!= CHARSET_8_BIT_CONTROL
864 && charset
!= CHARSET_8_BIT_GRAPHIC
865 && charset
!= charset_latin_iso8859_1
)
869 charsets
[charset
] = 1;
870 else if (return_val
== 2)
877 DEFUN ("find-charset-region", Ffind_charset_region
, Sfind_charset_region
,
879 doc
: /* Return a list of charsets in the region between BEG and END.
880 BEG and END are buffer positions.
881 Optional arg TABLE if non-nil is a translation table to look up.
883 If the region contains invalid multibyte characters,
884 `unknown' is included in the returned list.
886 If the current buffer is unibyte, the returned list may contain
887 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
889 Lisp_Object beg
, end
, table
;
891 int charsets
[MAX_CHARSET
+ 1];
892 int from
, from_byte
, to
, stop
, stop_byte
, i
;
895 validate_region (&beg
, &end
);
896 from
= XFASTINT (beg
);
897 stop
= to
= XFASTINT (end
);
899 if (from
< GPT
&& GPT
< to
)
902 stop_byte
= GPT_BYTE
;
905 stop_byte
= CHAR_TO_BYTE (stop
);
907 from_byte
= CHAR_TO_BYTE (from
);
909 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
912 find_charset_in_text (BYTE_POS_ADDR (from_byte
), stop
- from
,
913 stop_byte
- from_byte
, charsets
, table
);
916 from
= stop
, from_byte
= stop_byte
;
917 stop
= to
, stop_byte
= CHAR_TO_BYTE (stop
);
925 val
= Fcons (Qunknown
, val
);
926 for (i
= MAX_CHARSET
; i
>= MIN_CHARSET_OFFICIAL_DIMENSION1
; i
--)
928 val
= Fcons (CHARSET_SYMBOL (i
), val
);
930 val
= Fcons (Qascii
, val
);
934 DEFUN ("find-charset-string", Ffind_charset_string
, Sfind_charset_string
,
936 doc
: /* Return a list of charsets in STR.
937 Optional arg TABLE if non-nil is a translation table to look up.
939 If the string contains invalid multibyte characters,
940 `unknown' is included in the returned list.
942 If STR is unibyte, the returned list may contain
943 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
945 Lisp_Object str
, table
;
947 int charsets
[MAX_CHARSET
+ 1];
953 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
954 find_charset_in_text (SDATA (str
), SCHARS (str
),
955 SBYTES (str
), charsets
, table
);
959 val
= Fcons (Qunknown
, val
);
960 for (i
= MAX_CHARSET
; i
>= MIN_CHARSET_OFFICIAL_DIMENSION1
; i
--)
962 val
= Fcons (CHARSET_SYMBOL (i
), val
);
964 val
= Fcons (Qascii
, val
);
969 DEFUN ("make-char-internal", Fmake_char_internal
, Smake_char_internal
, 1, 3, 0,
970 doc
: /* Return a character made from arguments.
971 Internal use only. */)
972 (charset
, code1
, code2
)
973 Lisp_Object charset
, code1
, code2
;
975 int charset_id
, c1
, c2
;
977 CHECK_NUMBER (charset
);
978 charset_id
= XINT (charset
);
979 if (!CHARSET_DEFINED_P (charset_id
))
980 error ("Invalid charset ID: %d", XINT (charset
));
986 CHECK_NUMBER (code1
);
993 CHECK_NUMBER (code2
);
997 if (charset_id
== CHARSET_ASCII
)
999 if (c1
< 0 || c1
> 0x7F)
1000 goto invalid_code_posints
;
1001 return make_number (c1
);
1003 else if (charset_id
== CHARSET_8_BIT_CONTROL
)
1007 else if (c1
< 0x80 || c1
> 0x9F)
1008 goto invalid_code_posints
;
1009 return make_number (c1
);
1011 else if (charset_id
== CHARSET_8_BIT_GRAPHIC
)
1015 else if (c1
< 0xA0 || c1
> 0xFF)
1016 goto invalid_code_posints
;
1017 return make_number (c1
);
1019 else if (c1
< 0 || c1
> 0xFF || c2
< 0 || c2
> 0xFF)
1020 goto invalid_code_posints
;
1026 ? !CHAR_COMPONENTS_VALID_P (charset_id
, c1
, 0x20)
1027 : !CHAR_COMPONENTS_VALID_P (charset_id
, c1
, c2
)))
1028 goto invalid_code_posints
;
1029 return make_number (MAKE_CHAR (charset_id
, c1
, c2
));
1031 invalid_code_posints
:
1032 error ("Invalid code points for charset ID %d: %d %d", charset_id
, c1
, c2
);
1035 DEFUN ("split-char", Fsplit_char
, Ssplit_char
, 1, 1, 0,
1036 doc
: /* Return list of charset and one or two position-codes of CH.
1037 If CH is invalid as a character code,
1038 return a list of symbol `unknown' and CH. */)
1042 int c
, charset
, c1
, c2
;
1046 if (!CHAR_VALID_P (c
, 1))
1047 return Fcons (Qunknown
, Fcons (ch
, Qnil
));
1048 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
1050 ? Fcons (CHARSET_SYMBOL (charset
),
1051 Fcons (make_number (c1
), Fcons (make_number (c2
), Qnil
)))
1052 : Fcons (CHARSET_SYMBOL (charset
), Fcons (make_number (c1
), Qnil
)));
1055 DEFUN ("char-charset", Fchar_charset
, Schar_charset
, 1, 1, 0,
1056 doc
: /* Return charset of CH. */)
1062 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch
)));
1065 DEFUN ("charset-after", Fcharset_after
, Scharset_after
, 0, 1, 0,
1066 doc
: /* Return charset of a character in the current buffer at position POS.
1067 If POS is nil, it defauls to the current point.
1068 If POS is out of range, the value is nil. */)
1075 ch
= Fchar_after (pos
);
1076 if (! INTEGERP (ch
))
1078 charset
= CHAR_CHARSET (XINT (ch
));
1079 return CHARSET_SYMBOL (charset
);
1082 DEFUN ("iso-charset", Fiso_charset
, Siso_charset
, 3, 3, 0,
1083 doc
: /* Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.
1085 ISO 2022's designation sequence (escape sequence) distinguishes charsets
1086 by their DIMENSION, CHARS, and FINAL-CHAR,
1087 where as Emacs distinguishes them by charset symbol.
1088 See the documentation of the function `charset-info' for the meanings of
1089 DIMENSION, CHARS, and FINAL-CHAR. */)
1090 (dimension
, chars
, final_char
)
1091 Lisp_Object dimension
, chars
, final_char
;
1095 CHECK_NUMBER (dimension
);
1096 CHECK_NUMBER (chars
);
1097 CHECK_NUMBER (final_char
);
1099 if ((charset
= ISO_CHARSET_TABLE (dimension
, chars
, final_char
)) < 0)
1101 return CHARSET_SYMBOL (charset
);
1104 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1105 generic character. If GENERICP is zero, return nonzero iff C is a
1106 valid normal character. Do not call this function directly,
1107 instead use macro CHAR_VALID_P. */
1109 char_valid_p (c
, genericp
)
1112 int charset
, c1
, c2
;
1114 if (c
< 0 || c
>= MAX_CHAR
)
1116 if (SINGLE_BYTE_CHAR_P (c
))
1118 SPLIT_CHAR (c
, charset
, c1
, c2
);
1123 if (c2
<= 0) c2
= 0x20;
1127 if (c2
<= 0) c1
= c2
= 0x20;
1130 return (CHARSET_DEFINED_P (charset
)
1131 && CHAR_COMPONENTS_VALID_P (charset
, c1
, c2
));
1134 DEFUN ("char-valid-p", Fchar_valid_p
, Schar_valid_p
, 1, 2, 0,
1135 doc
: /* Return t if OBJECT is a valid normal character.
1136 If optional arg GENERICP is non-nil, also return t if OBJECT is
1137 a valid generic character. */)
1139 Lisp_Object object
, genericp
;
1141 if (! NATNUMP (object
))
1143 return (CHAR_VALID_P (XFASTINT (object
), !NILP (genericp
)) ? Qt
: Qnil
);
1146 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
1147 Sunibyte_char_to_multibyte
, 1, 1, 0,
1148 doc
: /* Convert the unibyte character CH to multibyte character.
1149 The conversion is done based on `nonascii-translation-table' (which see)
1150 or `nonascii-insert-offset' (which see). */)
1158 if (c
< 0 || c
>= 0400)
1159 error ("Invalid unibyte character: %d", c
);
1160 c
= unibyte_char_to_multibyte (c
);
1162 error ("Can't convert to multibyte character: %d", XINT (ch
));
1163 return make_number (c
);
1166 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte
,
1167 Smultibyte_char_to_unibyte
, 1, 1, 0,
1168 doc
: /* Convert the multibyte character CH to unibyte character.
1169 The conversion is done based on `nonascii-translation-table' (which see)
1170 or `nonascii-insert-offset' (which see). */)
1178 if (! CHAR_VALID_P (c
, 0))
1179 error ("Invalid multibyte character: %d", c
);
1180 c
= multibyte_char_to_unibyte (c
, Qnil
);
1182 error ("Can't convert to unibyte character: %d", XINT (ch
));
1183 return make_number (c
);
1186 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
1187 doc
: /* Return 1 regardless of the argument CH. */)
1192 return make_number (1);
1195 /* Return how many bytes C will occupy in a multibyte buffer.
1196 Don't call this function directly, instead use macro CHAR_BYTES. */
1203 if (ASCII_BYTE_P (c
) || (c
& ~((1 << CHARACTERBITS
) -1)))
1205 if (SINGLE_BYTE_CHAR_P (c
) && c
>= 0xA0)
1208 charset
= CHAR_CHARSET (c
);
1209 return (CHARSET_DEFINED_P (charset
) ? CHARSET_BYTES (charset
) : 1);
1212 /* Return the width of character of which multi-byte form starts with
1213 C. The width is measured by how many columns occupied on the
1214 screen when displayed in the current buffer. */
1216 #define ONE_BYTE_CHAR_WIDTH(c) \
1219 ? XFASTINT (current_buffer->tab_width) \
1220 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1224 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1225 : ((! NILP (current_buffer->enable_multibyte_characters) \
1226 && BASE_LEADING_CODE_P (c)) \
1227 ? WIDTH_BY_CHAR_HEAD (c) \
1230 DEFUN ("char-width", Fchar_width
, Schar_width
, 1, 1, 0,
1231 doc
: /* Return width of CH when displayed in the current buffer.
1232 The width is measured by how many columns it occupies on the screen.
1233 Tab is taken to occupy `tab-width' columns. */)
1237 Lisp_Object val
, disp
;
1239 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1245 /* Get the way the display table would display it. */
1246 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
1249 XSETINT (val
, XVECTOR (disp
)->size
);
1250 else if (SINGLE_BYTE_CHAR_P (c
))
1251 XSETINT (val
, ONE_BYTE_CHAR_WIDTH (c
));
1254 int charset
= CHAR_CHARSET (c
);
1256 XSETFASTINT (val
, CHARSET_WIDTH (charset
));
1261 /* Return width of string STR of length LEN when displayed in the
1262 current buffer. The width is measured by how many columns it
1263 occupies on the screen. */
1270 return c_string_width (str
, len
, -1, NULL
, NULL
);
1273 /* Return width of string STR of length LEN when displayed in the
1274 current buffer. The width is measured by how many columns it
1275 occupies on the screen. If PRECISION > 0, return the width of
1276 longest substring that doesn't exceed PRECISION, and set number of
1277 characters and bytes of the substring in *NCHARS and *NBYTES
1281 c_string_width (str
, len
, precision
, nchars
, nbytes
)
1282 const unsigned char *str
;
1283 int len
, precision
, *nchars
, *nbytes
;
1285 int i
= 0, i_byte
= 0;
1288 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1290 while (i_byte
< len
)
1292 int bytes
, thiswidth
;
1297 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
1300 val
= DISP_CHAR_VECTOR (dp
, c
);
1302 thiswidth
= XVECTOR (val
)->size
;
1304 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1309 PARSE_MULTIBYTE_SEQ (str
+ i_byte
, len
- i_byte
, bytes
);
1310 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1314 && (width
+ thiswidth
> precision
))
1334 /* Return width of Lisp string STRING when displayed in the current
1335 buffer. The width is measured by how many columns it occupies on
1336 the screen while paying attention to compositions. If PRECISION >
1337 0, return the width of longest substring that doesn't exceed
1338 PRECISION, and set number of characters and bytes of the substring
1339 in *NCHARS and *NBYTES respectively. */
1342 lisp_string_width (string
, precision
, nchars
, nbytes
)
1344 int precision
, *nchars
, *nbytes
;
1346 int len
= SCHARS (string
);
1347 int len_byte
= SBYTES (string
);
1348 /* This set multibyte to 0 even if STRING is multibyte when it
1349 contains only ascii and eight-bit-graphic, but that's
1351 int multibyte
= len
< len_byte
;
1352 const unsigned char *str
= SDATA (string
);
1353 int i
= 0, i_byte
= 0;
1355 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1359 int chars
, bytes
, thiswidth
;
1364 if (find_composition (i
, -1, &ignore
, &end
, &val
, string
)
1365 && ((cmp_id
= get_composition_id (i
, i_byte
, end
- i
, val
, string
))
1368 thiswidth
= composition_table
[cmp_id
]->width
;
1370 bytes
= string_char_to_byte (string
, end
) - i_byte
;
1377 c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
1379 c
= str
[i_byte
], bytes
= 1;
1381 val
= DISP_CHAR_VECTOR (dp
, c
);
1383 thiswidth
= XVECTOR (val
)->size
;
1385 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1391 PARSE_MULTIBYTE_SEQ (str
+ i_byte
, len_byte
- i_byte
, bytes
);
1394 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1398 && (width
+ thiswidth
> precision
))
1418 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
1419 doc
: /* Return width of STRING when displayed in the current buffer.
1420 Width is measured by how many columns it occupies on the screen.
1421 When calculating width of a multibyte character in STRING,
1422 only the base leading-code is considered; the validity of
1423 the following bytes is not checked. Tabs in STRING are always
1424 taken to occupy `tab-width' columns. */)
1430 CHECK_STRING (string
);
1431 XSETFASTINT (val
, lisp_string_width (string
, -1, NULL
, NULL
));
1435 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
1436 doc
: /* Return the direction of CH.
1437 The returned value is 0 for left-to-right and 1 for right-to-left. */)
1444 charset
= CHAR_CHARSET (XFASTINT (ch
));
1445 if (!CHARSET_DEFINED_P (charset
))
1446 invalid_character (XINT (ch
));
1447 return CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
);
1450 /* Return the number of characters in the NBYTES bytes at PTR.
1451 This works by looking at the contents and checking for multibyte sequences.
1452 However, if the current buffer has enable-multibyte-characters = nil,
1453 we treat each byte as a character. */
1456 chars_in_text (ptr
, nbytes
)
1457 const unsigned char *ptr
;
1460 /* current_buffer is null at early stages of Emacs initialization. */
1461 if (current_buffer
== 0
1462 || NILP (current_buffer
->enable_multibyte_characters
))
1465 return multibyte_chars_in_text (ptr
, nbytes
);
1468 /* Return the number of characters in the NBYTES bytes at PTR.
1469 This works by looking at the contents and checking for multibyte sequences.
1470 It ignores enable-multibyte-characters. */
1473 multibyte_chars_in_text (ptr
, nbytes
)
1474 const unsigned char *ptr
;
1477 const unsigned char *endp
;
1480 endp
= ptr
+ nbytes
;
1485 PARSE_MULTIBYTE_SEQ (ptr
, endp
- ptr
, bytes
);
1493 /* Parse unibyte text at STR of LEN bytes as multibyte text, and
1494 count the numbers of characters and bytes in it. On counting
1495 bytes, pay attention to the fact that 8-bit characters in the range
1496 0x80..0x9F are represented by 2 bytes in multibyte text. */
1498 parse_str_as_multibyte (str
, len
, nchars
, nbytes
)
1499 const unsigned char *str
;
1500 int len
, *nchars
, *nbytes
;
1502 const unsigned char *endp
= str
+ len
;
1503 int n
, chars
= 0, bytes
= 0;
1507 if (UNIBYTE_STR_AS_MULTIBYTE_P (str
, endp
- str
, n
))
1508 str
+= n
, bytes
+= n
;
1518 /* Arrange unibyte text at STR of NBYTES bytes as multibyte text.
1519 It actually converts only 8-bit characters in the range 0x80..0x9F
1520 that don't contruct multibyte characters to multibyte forms. If
1521 NCHARS is nonzero, set *NCHARS to the number of characters in the
1522 text. It is assured that we can use LEN bytes at STR as a work
1523 area and that is enough. Return the number of bytes of the
1527 str_as_multibyte (str
, len
, nbytes
, nchars
)
1529 int len
, nbytes
, *nchars
;
1531 unsigned char *p
= str
, *endp
= str
+ nbytes
;
1536 while (p
< endp
&& UNIBYTE_STR_AS_MULTIBYTE_P (p
, endp
- p
, n
))
1546 safe_bcopy (p
, endp
- nbytes
, nbytes
);
1550 if (UNIBYTE_STR_AS_MULTIBYTE_P (p
, endp
- p
, n
))
1557 *to
++ = LEADING_CODE_8_BIT_CONTROL
;
1558 *to
++ = *p
++ + 0x20;
1567 /* Parse unibyte string at STR of LEN bytes, and return the number of
1568 bytes it may ocupy when converted to multibyte string by
1569 `str_to_multibyte'. */
1572 parse_str_to_multibyte (str
, len
)
1576 unsigned char *endp
= str
+ len
;
1579 for (bytes
= 0; str
< endp
; str
++)
1580 bytes
+= (*str
< 0x80 || *str
>= 0xA0) ? 1 : 2;
1584 /* Convert unibyte text at STR of NBYTES bytes to multibyte text
1585 that contains the same single-byte characters. It actually
1586 converts all 8-bit characters to multibyte forms. It is assured
1587 that we can use LEN bytes at STR as a work area and that is
1591 str_to_multibyte (str
, len
, bytes
)
1595 unsigned char *p
= str
, *endp
= str
+ bytes
;
1598 while (p
< endp
&& (*p
< 0x80 || *p
>= 0xA0)) p
++;
1604 safe_bcopy (p
, endp
- bytes
, bytes
);
1608 if (*p
< 0x80 || *p
>= 0xA0)
1611 *to
++ = LEADING_CODE_8_BIT_CONTROL
, *to
++ = *p
++ + 0x20;
1616 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1617 actually converts only 8-bit characters in the range 0x80..0x9F to
1621 str_as_unibyte (str
, bytes
)
1625 unsigned char *p
= str
, *endp
= str
+ bytes
;
1626 unsigned char *to
= str
;
1628 while (p
< endp
&& *p
!= LEADING_CODE_8_BIT_CONTROL
) p
++;
1632 if (*p
== LEADING_CODE_8_BIT_CONTROL
)
1633 *to
++ = *(p
+ 1) - 0x20, p
+= 2;
1641 DEFUN ("string", Fstring
, Sstring
, 0, MANY
, 0,
1642 doc
: /* Concatenate all the argument characters and make the result a string.
1643 usage: (string &rest CHARACTERS) */)
1649 unsigned char *buf
, *p
;
1655 bufsize
= MAX_MULTIBYTE_LENGTH
* n
;
1656 SAFE_ALLOCA (buf
, unsigned char *, bufsize
);
1659 for (i
= 0; i
< n
; i
++)
1661 CHECK_NUMBER (args
[i
]);
1662 if (!multibyte
&& !SINGLE_BYTE_CHAR_P (XFASTINT (args
[i
])))
1666 for (i
= 0; i
< n
; i
++)
1670 p
+= CHAR_STRING (c
, p
);
1675 ret
= make_string_from_bytes (buf
, n
, p
- buf
);
1684 charset_id_internal (charset_name
)
1689 val
= Fget (intern (charset_name
), Qcharset
);
1691 error ("Charset %s is not defined", charset_name
);
1693 return (XINT (XVECTOR (val
)->contents
[0]));
1696 DEFUN ("setup-special-charsets", Fsetup_special_charsets
,
1697 Ssetup_special_charsets
, 0, 0, 0, doc
: /* Internal use only. */)
1700 charset_latin_iso8859_1
= charset_id_internal ("latin-iso8859-1");
1701 charset_jisx0208_1978
= charset_id_internal ("japanese-jisx0208-1978");
1702 charset_jisx0208
= charset_id_internal ("japanese-jisx0208");
1703 charset_katakana_jisx0201
= charset_id_internal ("katakana-jisx0201");
1704 charset_latin_jisx0201
= charset_id_internal ("latin-jisx0201");
1705 charset_big5_1
= charset_id_internal ("chinese-big5-1");
1706 charset_big5_2
= charset_id_internal ("chinese-big5-2");
1707 charset_mule_unicode_0100_24ff
1708 = charset_id_internal ("mule-unicode-0100-24ff");
1709 charset_mule_unicode_2500_33ff
1710 = charset_id_internal ("mule-unicode-2500-33ff");
1711 charset_mule_unicode_e000_ffff
1712 = charset_id_internal ("mule-unicode-e000-ffff");
1717 init_charset_once ()
1721 staticpro (&Vcharset_table
);
1722 staticpro (&Vcharset_symbol_table
);
1723 staticpro (&Vgeneric_character_list
);
1725 /* This has to be done here, before we call Fmake_char_table. */
1726 Qcharset_table
= intern ("charset-table");
1727 staticpro (&Qcharset_table
);
1729 /* Intern this now in case it isn't already done.
1730 Setting this variable twice is harmless.
1731 But don't staticpro it here--that is done in alloc.c. */
1732 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
1734 /* Now we are ready to set up this property, so we can
1735 create the charset table. */
1736 Fput (Qcharset_table
, Qchar_table_extra_slots
, make_number (0));
1737 Vcharset_table
= Fmake_char_table (Qcharset_table
, Qnil
);
1739 Qunknown
= intern ("unknown");
1740 staticpro (&Qunknown
);
1741 Vcharset_symbol_table
= Fmake_vector (make_number (MAX_CHARSET
+ 1),
1745 for (i
= 0; i
< 2; i
++)
1746 for (j
= 0; j
< 2; j
++)
1747 for (k
= 0; k
< 128; k
++)
1748 iso_charset_table
[i
][j
][k
] = -1;
1750 for (i
= 0; i
< 256; i
++)
1751 bytes_by_char_head
[i
] = 1;
1752 bytes_by_char_head
[LEADING_CODE_PRIVATE_11
] = 3;
1753 bytes_by_char_head
[LEADING_CODE_PRIVATE_12
] = 3;
1754 bytes_by_char_head
[LEADING_CODE_PRIVATE_21
] = 4;
1755 bytes_by_char_head
[LEADING_CODE_PRIVATE_22
] = 4;
1757 for (i
= 0; i
< 128; i
++)
1758 width_by_char_head
[i
] = 1;
1759 for (; i
< 256; i
++)
1760 width_by_char_head
[i
] = 4;
1761 width_by_char_head
[LEADING_CODE_PRIVATE_11
] = 1;
1762 width_by_char_head
[LEADING_CODE_PRIVATE_12
] = 2;
1763 width_by_char_head
[LEADING_CODE_PRIVATE_21
] = 1;
1764 width_by_char_head
[LEADING_CODE_PRIVATE_22
] = 2;
1770 for (i
= 0x81; i
< 0x90; i
++)
1771 val
= Fcons (make_number ((i
- 0x70) << 7), val
);
1772 for (; i
< 0x9A; i
++)
1773 val
= Fcons (make_number ((i
- 0x8F) << 14), val
);
1774 for (i
= 0xA0; i
< 0xF0; i
++)
1775 val
= Fcons (make_number ((i
- 0x70) << 7), val
);
1776 for (; i
< 0xFF; i
++)
1777 val
= Fcons (make_number ((i
- 0xE0) << 14), val
);
1778 Vgeneric_character_list
= Fnreverse (val
);
1781 nonascii_insert_offset
= 0;
1782 Vnonascii_translation_table
= Qnil
;
1790 Qcharset
= intern ("charset");
1791 staticpro (&Qcharset
);
1793 Qascii
= intern ("ascii");
1794 staticpro (&Qascii
);
1796 Qeight_bit_control
= intern ("eight-bit-control");
1797 staticpro (&Qeight_bit_control
);
1799 Qeight_bit_graphic
= intern ("eight-bit-graphic");
1800 staticpro (&Qeight_bit_graphic
);
1802 /* Define special charsets ascii, eight-bit-control, and
1803 eight-bit-graphic. */
1804 update_charset_table (make_number (CHARSET_ASCII
),
1805 make_number (1), make_number (94),
1810 build_string ("ASCII"),
1811 Qnil
, /* same as above */
1812 build_string ("ASCII (ISO646 IRV)"));
1813 CHARSET_SYMBOL (CHARSET_ASCII
) = Qascii
;
1814 Fput (Qascii
, Qcharset
, CHARSET_TABLE_ENTRY (CHARSET_ASCII
));
1816 update_charset_table (make_number (CHARSET_8_BIT_CONTROL
),
1817 make_number (1), make_number (96),
1822 build_string ("8-bit control code (0x80..0x9F)"),
1823 Qnil
, /* same as above */
1824 Qnil
); /* same as above */
1825 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL
) = Qeight_bit_control
;
1826 Fput (Qeight_bit_control
, Qcharset
,
1827 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL
));
1829 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC
),
1830 make_number (1), make_number (96),
1835 build_string ("8-bit graphic char (0xA0..0xFF)"),
1836 Qnil
, /* same as above */
1837 Qnil
); /* same as above */
1838 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC
) = Qeight_bit_graphic
;
1839 Fput (Qeight_bit_graphic
, Qcharset
,
1840 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC
));
1842 Qauto_fill_chars
= intern ("auto-fill-chars");
1843 staticpro (&Qauto_fill_chars
);
1844 Fput (Qauto_fill_chars
, Qchar_table_extra_slots
, make_number (0));
1846 defsubr (&Sdefine_charset
);
1847 defsubr (&Sgeneric_character_list
);
1848 defsubr (&Sget_unused_iso_final_char
);
1849 defsubr (&Sdeclare_equiv_charset
);
1850 defsubr (&Sfind_charset_region
);
1851 defsubr (&Sfind_charset_string
);
1852 defsubr (&Smake_char_internal
);
1853 defsubr (&Ssplit_char
);
1854 defsubr (&Schar_charset
);
1855 defsubr (&Scharset_after
);
1856 defsubr (&Siso_charset
);
1857 defsubr (&Schar_valid_p
);
1858 defsubr (&Sunibyte_char_to_multibyte
);
1859 defsubr (&Smultibyte_char_to_unibyte
);
1860 defsubr (&Schar_bytes
);
1861 defsubr (&Schar_width
);
1862 defsubr (&Sstring_width
);
1863 defsubr (&Schar_direction
);
1865 defsubr (&Ssetup_special_charsets
);
1867 DEFVAR_LISP ("charset-list", &Vcharset_list
,
1868 doc
: /* List of charsets ever defined. */);
1869 Vcharset_list
= Fcons (Qascii
, Fcons (Qeight_bit_control
,
1870 Fcons (Qeight_bit_graphic
, Qnil
)));
1872 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector
,
1873 doc
: /* Vector of cons cell of a symbol and translation table ever defined.
1874 An ID of a translation table is an index of this vector. */);
1875 Vtranslation_table_vector
= Fmake_vector (make_number (16), Qnil
);
1877 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11
,
1878 doc
: /* Leading-code of private TYPE9N charset of column-width 1. */);
1879 leading_code_private_11
= LEADING_CODE_PRIVATE_11
;
1881 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12
,
1882 doc
: /* Leading-code of private TYPE9N charset of column-width 2. */);
1883 leading_code_private_12
= LEADING_CODE_PRIVATE_12
;
1885 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21
,
1886 doc
: /* Leading-code of private TYPE9Nx9N charset of column-width 1. */);
1887 leading_code_private_21
= LEADING_CODE_PRIVATE_21
;
1889 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22
,
1890 doc
: /* Leading-code of private TYPE9Nx9N charset of column-width 2. */);
1891 leading_code_private_22
= LEADING_CODE_PRIVATE_22
;
1893 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset
,
1894 doc
: /* Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.
1895 This is used for converting unibyte text to multibyte,
1896 and for inserting character codes specified by number.
1898 This serves to convert a Latin-1 or similar 8-bit character code
1899 to the corresponding Emacs multibyte character code.
1900 Typically the value should be (- (make-char CHARSET 0) 128),
1901 for your choice of character set.
1902 If `nonascii-translation-table' is non-nil, it overrides this variable. */);
1903 nonascii_insert_offset
= 0;
1905 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table
,
1906 doc
: /* Translation table to convert non-ASCII unibyte codes to multibyte.
1907 This is used for converting unibyte text to multibyte,
1908 and for inserting character codes specified by number.
1910 Conversion is performed only when multibyte characters are enabled,
1911 and it serves to convert a Latin-1 or similar 8-bit character code
1912 to the corresponding Emacs character code.
1914 If this is nil, `nonascii-insert-offset' is used instead.
1915 See also the docstring of `make-translation-table'. */);
1916 Vnonascii_translation_table
= Qnil
;
1918 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars
,
1919 doc
: /* A char-table for characters which invoke auto-filling.
1920 Such characters have value t in this table. */);
1921 Vauto_fill_chars
= Fmake_char_table (Qauto_fill_chars
, Qnil
);
1922 CHAR_TABLE_SET (Vauto_fill_chars
, make_number (' '), Qt
);
1923 CHAR_TABLE_SET (Vauto_fill_chars
, make_number ('\n'), Qt
);
1928 /* arch-tag: 66a89b8d-4c28-47d3-9ca1-56f78440d69f
1929 (do not change this comment) */