1 /* Basic multilingual character support.
2 Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1997, 1998, 1999, 2000, 2001
4 National Institute of Advanced Industrial Science and Technology (AIST)
5 Registration Number H14PRO021
7 This file is part of GNU Emacs.
9 GNU Emacs is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2, or (at your option)
14 GNU Emacs is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with GNU Emacs; see the file COPYING. If not, write to
21 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22 Boston, MA 02110-1301, USA. */
24 /* At first, see the document in `charset.h' to understand the code in
35 #include <sys/types.h>
39 #include "composite.h"
49 Lisp_Object Qcharset
, Qascii
, Qeight_bit_control
, Qeight_bit_graphic
;
52 /* Declaration of special leading-codes. */
53 EMACS_INT leading_code_private_11
; /* for private DIMENSION1 of 1-column */
54 EMACS_INT leading_code_private_12
; /* for private DIMENSION1 of 2-column */
55 EMACS_INT leading_code_private_21
; /* for private DIMENSION2 of 1-column */
56 EMACS_INT leading_code_private_22
; /* for private DIMENSION2 of 2-column */
58 /* Declaration of special charsets. The values are set by
59 Fsetup_special_charsets. */
60 int charset_latin_iso8859_1
; /* ISO8859-1 (Latin-1) */
61 int charset_jisx0208_1978
; /* JISX0208.1978 (Japanese Kanji old set) */
62 int charset_jisx0208
; /* JISX0208.1983 (Japanese Kanji) */
63 int charset_katakana_jisx0201
; /* JISX0201.Kana (Japanese Katakana) */
64 int charset_latin_jisx0201
; /* JISX0201.Roman (Japanese Roman) */
65 int charset_big5_1
; /* Big5 Level 1 (Chinese Traditional) */
66 int charset_big5_2
; /* Big5 Level 2 (Chinese Traditional) */
67 int charset_mule_unicode_0100_24ff
;
68 int charset_mule_unicode_2500_33ff
;
69 int charset_mule_unicode_e000_ffff
;
71 Lisp_Object Qcharset_table
;
73 /* A char-table containing information of each character set. */
74 Lisp_Object Vcharset_table
;
76 /* A vector of charset symbol indexed by charset-id. This is used
77 only for returning charset symbol from C functions. */
78 Lisp_Object Vcharset_symbol_table
;
80 /* A list of charset symbols ever defined. */
81 Lisp_Object Vcharset_list
;
83 /* Vector of translation table ever defined.
84 ID of a translation table is used to index this vector. */
85 Lisp_Object Vtranslation_table_vector
;
87 /* A char-table for characters which may invoke auto-filling. */
88 Lisp_Object Vauto_fill_chars
;
90 Lisp_Object Qauto_fill_chars
;
92 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
93 int bytes_by_char_head
[256];
94 int width_by_char_head
[256];
96 /* Mapping table from ISO2022's charset (specified by DIMENSION,
97 CHARS, and FINAL-CHAR) to Emacs' charset. */
98 int iso_charset_table
[2][2][128];
100 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
101 unsigned char *_fetch_multibyte_char_p
;
102 int _fetch_multibyte_char_len
;
104 /* Offset to add to a non-ASCII value when inserting it. */
105 EMACS_INT nonascii_insert_offset
;
107 /* Translation table for converting non-ASCII unibyte characters
108 to multibyte codes, or nil. */
109 Lisp_Object Vnonascii_translation_table
;
111 /* List of all possible generic characters. */
112 Lisp_Object Vgeneric_character_list
;
116 invalid_character (c
)
119 error ("Invalid character: %d, #o%o, #x%x", c
, c
, c
);
122 /* Parse string STR of length LENGTH and fetch information of a
123 character at STR. Set BYTES to the byte length the character
124 occupies, CHARSET, C1, C2 to proper values of the character. */
126 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
129 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
131 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
132 else if ((bytes) == 2) \
134 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
135 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
137 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
139 else if ((bytes) == 3) \
141 if ((c1) < LEADING_CODE_PRIVATE_11) \
142 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
144 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
147 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
150 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.
151 Note that this intentionally allows invalid components, such
152 as 0xA0 0xA0, because there exist many files that contain
153 such invalid byte sequences, especially in EUC-GB. */
154 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
155 ((charset) == CHARSET_ASCII \
156 ? ((c1) >= 0 && (c1) <= 0x7F) \
157 : ((charset) == CHARSET_8_BIT_CONTROL \
158 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
159 : ((charset) == CHARSET_8_BIT_GRAPHIC \
160 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
161 : (CHARSET_DIMENSION (charset) == 1 \
162 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
163 : ((c1) >= 0x20 && (c1) <= 0x7F \
164 && (c2) >= 0x20 && (c2) <= 0x7F)))))
166 /* Store multi-byte form of the character C in STR. The caller should
167 allocate at least 4-byte area at STR in advance. Returns the
168 length of the multi-byte form. If C is an invalid character code,
172 char_to_string_1 (c
, str
)
176 unsigned char *p
= str
;
178 if (c
& CHAR_MODIFIER_MASK
) /* This includes the case C is negative. */
180 /* Multibyte character can't have a modifier bit. */
181 if (! SINGLE_BYTE_CHAR_P ((c
& ~CHAR_MODIFIER_MASK
)))
184 /* For Meta, Shift, and Control modifiers, we need special care. */
187 /* Move the meta bit to the right place for a string. */
188 c
= (c
& ~CHAR_META
) | 0x80;
192 /* Shift modifier is valid only with [A-Za-z]. */
193 if ((c
& 0377) >= 'A' && (c
& 0377) <= 'Z')
195 else if ((c
& 0377) >= 'a' && (c
& 0377) <= 'z')
196 c
= (c
& ~CHAR_SHIFT
) - ('a' - 'A');
200 /* Simulate the code in lread.c. */
201 /* Allow `\C- ' and `\C-?'. */
202 if (c
== (CHAR_CTL
| ' '))
204 else if (c
== (CHAR_CTL
| '?'))
206 /* ASCII control chars are made from letters (both cases),
207 as well as the non-letters within 0100...0137. */
208 else if ((c
& 0137) >= 0101 && (c
& 0137) <= 0132)
209 c
&= (037 | (~0177 & ~CHAR_CTL
));
210 else if ((c
& 0177) >= 0100 && (c
& 0177) <= 0137)
211 c
&= (037 | (~0177 & ~CHAR_CTL
));
214 /* If C still has any modifier bits, just ignore it. */
215 c
&= ~CHAR_MODIFIER_MASK
;
218 if (SINGLE_BYTE_CHAR_P (c
))
220 if (ASCII_BYTE_P (c
) || c
>= 0xA0)
224 *p
++ = LEADING_CODE_8_BIT_CONTROL
;
228 else if (CHAR_VALID_P (c
, 0))
232 SPLIT_CHAR (c
, charset
, c1
, c2
);
234 if (charset
>= LEADING_CODE_EXT_11
)
235 *p
++ = (charset
< LEADING_CODE_EXT_12
236 ? LEADING_CODE_PRIVATE_11
237 : (charset
< LEADING_CODE_EXT_21
238 ? LEADING_CODE_PRIVATE_12
239 : (charset
< LEADING_CODE_EXT_22
240 ? LEADING_CODE_PRIVATE_21
241 : LEADING_CODE_PRIVATE_22
)));
243 if ((c1
> 0 && c1
< 32) || (c2
> 0 && c2
< 32))
259 /* Store multi-byte form of the character C in STR. The caller should
260 allocate at least 4-byte area at STR in advance. Returns the
261 length of the multi-byte form. If C is an invalid character code,
264 Use macro `CHAR_STRING (C, STR)' instead of calling this function
265 directly if C can be an ASCII character. */
268 char_to_string (c
, str
)
273 len
= char_to_string_1 (c
, str
);
275 invalid_character (c
);
280 /* Return the non-ASCII character corresponding to multi-byte form at
281 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
282 length of the multibyte form in *ACTUAL_LEN.
284 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
285 this function directly if you want ot handle ASCII characters as
289 string_to_char (str
, len
, actual_len
)
290 const unsigned char *str
;
291 int len
, *actual_len
;
293 int c
, bytes
, charset
, c1
, c2
;
295 SPLIT_MULTIBYTE_SEQ (str
, len
, bytes
, charset
, c1
, c2
);
296 c
= MAKE_CHAR (charset
, c1
, c2
);
302 /* Return the length of the multi-byte form at string STR of length LEN.
303 Use the macro MULTIBYTE_FORM_LENGTH instead. */
305 multibyte_form_length (str
, len
)
306 const unsigned char *str
;
311 PARSE_MULTIBYTE_SEQ (str
, len
, bytes
);
315 /* Check multibyte form at string STR of length LEN and set variables
316 pointed by CHARSET, C1, and C2 to charset and position codes of the
317 character at STR, and return 0. If there's no multibyte character,
318 return -1. This should be used only in the macro SPLIT_STRING
319 which checks range of STR in advance. */
322 split_string (str
, len
, charset
, c1
, c2
)
323 const unsigned char *str
;
324 unsigned char *c1
, *c2
;
327 register int bytes
, cs
, code1
, code2
= -1;
329 SPLIT_MULTIBYTE_SEQ (str
, len
, bytes
, cs
, code1
, code2
);
330 if (cs
== CHARSET_ASCII
)
338 /* Return 1 iff character C has valid printable glyph.
339 Use the macro CHAR_PRINTABLE_P instead. */
346 if (ASCII_BYTE_P (c
))
348 else if (SINGLE_BYTE_CHAR_P (c
))
350 else if (c
>= MAX_CHAR
)
353 SPLIT_CHAR (c
, charset
, c1
, c2
);
354 if (! CHARSET_DEFINED_P (charset
))
356 if (CHARSET_CHARS (charset
) == 94
357 ? c1
<= 32 || c1
>= 127
360 if (CHARSET_DIMENSION (charset
) == 2
361 && (CHARSET_CHARS (charset
) == 94
362 ? c2
<= 32 || c2
>= 127
368 /* Translate character C by translation table TABLE. If C
369 is negative, translate a character specified by CHARSET, C1, and C2
370 (C1 and C2 are code points of the character). If no translation is
371 found in TABLE, return C. */
373 translate_char (table
, c
, charset
, c1
, c2
)
375 int c
, charset
, c1
, c2
;
378 int alt_charset
, alt_c1
, alt_c2
, dimension
;
380 if (c
< 0) c
= MAKE_CHAR (charset
, (c1
& 0x7F) , (c2
& 0x7F));
381 if (!CHAR_TABLE_P (table
)
382 || (ch
= Faref (table
, make_number (c
)), !NATNUMP (ch
)))
385 SPLIT_CHAR (XFASTINT (ch
), alt_charset
, alt_c1
, alt_c2
);
386 dimension
= CHARSET_DIMENSION (alt_charset
);
387 if ((dimension
== 1 && alt_c1
> 0) || (dimension
== 2 && alt_c2
> 0))
388 /* CH is not a generic character, just return it. */
389 return XFASTINT (ch
);
391 /* Since CH is a generic character, we must return a specific
392 charater which has the same position codes as C from CH. */
394 SPLIT_CHAR (c
, charset
, c1
, c2
);
395 if (dimension
!= CHARSET_DIMENSION (charset
))
396 /* We can't make such a character because of dimension mismatch. */
398 return MAKE_CHAR (alt_charset
, c1
, c2
);
401 /* Convert the unibyte character C to multibyte based on
402 Vnonascii_translation_table or nonascii_insert_offset. If they can't
403 convert C to a valid multibyte character, convert it based on
404 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
407 unibyte_char_to_multibyte (c
)
410 if (c
< 0400 && c
>= 0200)
414 if (! NILP (Vnonascii_translation_table
))
416 c
= XINT (Faref (Vnonascii_translation_table
, make_number (c
)));
417 if (c
>= 0400 && ! char_valid_p (c
, 0))
418 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
420 else if (c
>= 0240 && nonascii_insert_offset
> 0)
422 c
+= nonascii_insert_offset
;
423 if (c
< 0400 || ! char_valid_p (c
, 0))
424 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
427 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
433 /* Convert the multibyte character C to unibyte 8-bit character based
434 on Vnonascii_translation_table or nonascii_insert_offset. If
435 REV_TBL is non-nil, it should be a reverse table of
436 Vnonascii_translation_table, i.e. what given by:
437 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
440 multibyte_char_to_unibyte (c
, rev_tbl
)
444 if (!SINGLE_BYTE_CHAR_P (c
))
448 if (! CHAR_TABLE_P (rev_tbl
)
449 && CHAR_TABLE_P (Vnonascii_translation_table
))
450 rev_tbl
= Fchar_table_extra_slot (Vnonascii_translation_table
,
452 if (CHAR_TABLE_P (rev_tbl
))
455 temp
= Faref (rev_tbl
, make_number (c
));
459 c
= (c_save
& 0177) + 0200;
463 if (nonascii_insert_offset
> 0)
464 c
-= nonascii_insert_offset
;
465 if (c
< 128 || c
>= 256)
466 c
= (c_save
& 0177) + 0200;
474 /* Update the table Vcharset_table with the given arguments (see the
475 document of `define-charset' for the meaning of each argument).
476 Several other table contents are also updated. The caller should
477 check the validity of CHARSET-ID and the remaining arguments in
481 update_charset_table (charset_id
, dimension
, chars
, width
, direction
,
482 iso_final_char
, iso_graphic_plane
,
483 short_name
, long_name
, description
)
484 Lisp_Object charset_id
, dimension
, chars
, width
, direction
;
485 Lisp_Object iso_final_char
, iso_graphic_plane
;
486 Lisp_Object short_name
, long_name
, description
;
488 int charset
= XINT (charset_id
);
490 unsigned char leading_code_base
, leading_code_ext
;
492 if (NILP (CHARSET_TABLE_ENTRY (charset
)))
493 CHARSET_TABLE_ENTRY (charset
)
494 = Fmake_vector (make_number (CHARSET_MAX_IDX
), Qnil
);
496 if (NILP (long_name
))
497 long_name
= short_name
;
498 if (NILP (description
))
499 description
= long_name
;
501 /* Get byte length of multibyte form, base leading-code, and
502 extended leading-code of the charset. See the comment under the
503 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
504 bytes
= XINT (dimension
);
505 if (charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
507 /* Official charset, it doesn't have an extended leading-code. */
508 if (charset
!= CHARSET_ASCII
&& charset
!= CHARSET_8_BIT_GRAPHIC
)
509 bytes
+= 1; /* For a base leading-code. */
510 leading_code_base
= charset
;
511 leading_code_ext
= 0;
515 /* Private charset. */
516 bytes
+= 2; /* For base and extended leading-codes. */
518 = (charset
< LEADING_CODE_EXT_12
519 ? LEADING_CODE_PRIVATE_11
520 : (charset
< LEADING_CODE_EXT_21
521 ? LEADING_CODE_PRIVATE_12
522 : (charset
< LEADING_CODE_EXT_22
523 ? LEADING_CODE_PRIVATE_21
524 : LEADING_CODE_PRIVATE_22
)));
525 leading_code_ext
= charset
;
526 if (BYTES_BY_CHAR_HEAD (leading_code_base
) != bytes
)
527 error ("Invalid dimension for the charset-ID %d", charset
);
530 CHARSET_TABLE_INFO (charset
, CHARSET_ID_IDX
) = charset_id
;
531 CHARSET_TABLE_INFO (charset
, CHARSET_BYTES_IDX
) = make_number (bytes
);
532 CHARSET_TABLE_INFO (charset
, CHARSET_DIMENSION_IDX
) = dimension
;
533 CHARSET_TABLE_INFO (charset
, CHARSET_CHARS_IDX
) = chars
;
534 CHARSET_TABLE_INFO (charset
, CHARSET_WIDTH_IDX
) = width
;
535 CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
) = direction
;
536 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_BASE_IDX
)
537 = make_number (leading_code_base
);
538 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_EXT_IDX
)
539 = make_number (leading_code_ext
);
540 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_FINAL_CHAR_IDX
) = iso_final_char
;
541 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_GRAPHIC_PLANE_IDX
)
543 CHARSET_TABLE_INFO (charset
, CHARSET_SHORT_NAME_IDX
) = short_name
;
544 CHARSET_TABLE_INFO (charset
, CHARSET_LONG_NAME_IDX
) = long_name
;
545 CHARSET_TABLE_INFO (charset
, CHARSET_DESCRIPTION_IDX
) = description
;
546 CHARSET_TABLE_INFO (charset
, CHARSET_PLIST_IDX
) = Qnil
;
549 /* If we have already defined a charset which has the same
550 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
551 DIRECTION, we must update the entry REVERSE-CHARSET of both
552 charsets. If there's no such charset, the value of the entry
556 for (i
= 0; i
<= MAX_CHARSET
; i
++)
557 if (!NILP (CHARSET_TABLE_ENTRY (i
)))
559 if (CHARSET_DIMENSION (i
) == XINT (dimension
)
560 && CHARSET_CHARS (i
) == XINT (chars
)
561 && CHARSET_ISO_FINAL_CHAR (i
) == XINT (iso_final_char
)
562 && CHARSET_DIRECTION (i
) != XINT (direction
))
564 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
566 CHARSET_TABLE_INFO (i
, CHARSET_REVERSE_CHARSET_IDX
) = charset_id
;
571 /* No such a charset. */
572 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
576 if (charset
!= CHARSET_ASCII
&& charset
!= CHARSET_8_BIT_GRAPHIC
577 && charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
579 bytes_by_char_head
[leading_code_base
] = bytes
;
580 width_by_char_head
[leading_code_base
] = XINT (width
);
582 /* Update table emacs_code_class. */
583 emacs_code_class
[charset
] = (bytes
== 2
584 ? EMACS_leading_code_2
586 ? EMACS_leading_code_3
587 : EMACS_leading_code_4
));
590 /* Update table iso_charset_table. */
591 if (XINT (iso_final_char
) >= 0
592 && ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) < 0)
593 ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) = charset
;
598 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
601 get_charset_id (charset_symbol
)
602 Lisp_Object charset_symbol
;
607 /* This originally used a ?: operator, but reportedly the HP-UX
608 compiler version HP92453-01 A.10.32.22 miscompiles that. */
609 if (SYMBOLP (charset_symbol
)
610 && VECTORP (val
= Fget (charset_symbol
, Qcharset
))
611 && CHARSET_VALID_P (charset
=
612 XINT (XVECTOR (val
)->contents
[CHARSET_ID_IDX
])))
618 /* Return an identification number for a new private charset of
619 DIMENSION and WIDTH. If there's no more room for the new charset,
622 get_new_private_charset_id (dimension
, width
)
623 int dimension
, width
;
625 int charset
, from
, to
;
629 from
= LEADING_CODE_EXT_11
;
630 to
= LEADING_CODE_EXT_21
;
634 from
= LEADING_CODE_EXT_21
;
635 to
= LEADING_CODE_EXT_MAX
+ 1;
638 for (charset
= from
; charset
< to
; charset
++)
639 if (!CHARSET_DEFINED_P (charset
)) break;
641 return make_number (charset
< to
? charset
: 0);
644 DEFUN ("define-charset", Fdefine_charset
, Sdefine_charset
, 3, 3, 0,
645 doc
: /* Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.
646 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is
647 treated as a private charset.
648 INFO-VECTOR is a vector of the format:
649 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE
650 SHORT-NAME LONG-NAME DESCRIPTION]
651 The meanings of each elements is as follows:
652 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.
653 CHARS (integer) is the number of characters in a dimension: 94 or 96.
654 WIDTH (integer) is the number of columns a character in the charset
655 occupies on the screen: one of 0, 1, and 2.
657 DIRECTION (integer) is the rendering direction of characters in the
658 charset when rendering. If 0, render from left to right, else
659 render from right to left.
661 ISO-FINAL-CHAR (character) is the final character of the
662 corresponding ISO 2022 charset.
663 It may be -1 if the charset is internal use only.
665 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked
666 while encoding to variants of ISO 2022 coding system, one of the
667 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).
668 It may be -1 if the charset is internal use only.
670 SHORT-NAME (string) is the short name to refer to the charset.
672 LONG-NAME (string) is the long name to refer to the charset.
674 DESCRIPTION (string) is the description string of the charset. */)
675 (charset_id
, charset_symbol
, info_vector
)
676 Lisp_Object charset_id
, charset_symbol
, info_vector
;
680 if (!NILP (charset_id
))
681 CHECK_NUMBER (charset_id
);
682 CHECK_SYMBOL (charset_symbol
);
683 CHECK_VECTOR (info_vector
);
685 if (! NILP (charset_id
))
687 if (! CHARSET_VALID_P (XINT (charset_id
)))
688 error ("Invalid CHARSET: %d", XINT (charset_id
));
689 else if (CHARSET_DEFINED_P (XINT (charset_id
)))
690 error ("Already defined charset: %d", XINT (charset_id
));
693 vec
= XVECTOR (info_vector
)->contents
;
694 if (XVECTOR (info_vector
)->size
!= 9
695 || !INTEGERP (vec
[0]) || !(XINT (vec
[0]) == 1 || XINT (vec
[0]) == 2)
696 || !INTEGERP (vec
[1]) || !(XINT (vec
[1]) == 94 || XINT (vec
[1]) == 96)
697 || !INTEGERP (vec
[2]) || !(XINT (vec
[2]) == 1 || XINT (vec
[2]) == 2)
698 || !INTEGERP (vec
[3]) || !(XINT (vec
[3]) == 0 || XINT (vec
[3]) == 1)
699 || !INTEGERP (vec
[4])
700 || !(XINT (vec
[4]) == -1 || (XINT (vec
[4]) >= '0' && XINT (vec
[4]) <= '~'))
701 || !INTEGERP (vec
[5])
702 || !(XINT (vec
[5]) == -1 || XINT (vec
[5]) == 0 || XINT (vec
[5]) == 1)
705 || !STRINGP (vec
[8]))
706 error ("Invalid info-vector argument for defining charset %s",
707 SDATA (SYMBOL_NAME (charset_symbol
)));
709 if (NILP (charset_id
))
711 charset_id
= get_new_private_charset_id (XINT (vec
[0]), XINT (vec
[2]));
712 if (XINT (charset_id
) == 0)
713 error ("There's no room for a new private charset %s",
714 SDATA (SYMBOL_NAME (charset_symbol
)));
717 update_charset_table (charset_id
, vec
[0], vec
[1], vec
[2], vec
[3],
718 vec
[4], vec
[5], vec
[6], vec
[7], vec
[8]);
719 Fput (charset_symbol
, Qcharset
, CHARSET_TABLE_ENTRY (XINT (charset_id
)));
720 CHARSET_SYMBOL (XINT (charset_id
)) = charset_symbol
;
721 Vcharset_list
= Fcons (charset_symbol
, Vcharset_list
);
722 Fupdate_coding_systems_internal ();
726 DEFUN ("generic-character-list", Fgeneric_character_list
,
727 Sgeneric_character_list
, 0, 0, 0,
728 doc
: /* Return a list of all possible generic characters.
729 It includes a generic character for a charset not yet defined. */)
732 return Vgeneric_character_list
;
735 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char
,
736 Sget_unused_iso_final_char
, 2, 2, 0,
737 doc
: /* Return an unused ISO's final char for a charset of DIMENSION and CHARS.
738 DIMENSION is the number of bytes to represent a character: 1 or 2.
739 CHARS is the number of characters in a dimension: 94 or 96.
741 This final char is for private use, thus the range is `0' (48) .. `?' (63).
742 If there's no unused final char for the specified kind of charset,
745 Lisp_Object dimension
, chars
;
749 CHECK_NUMBER (dimension
);
750 CHECK_NUMBER (chars
);
751 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
752 error ("Invalid charset dimension %d, it should be 1 or 2",
754 if (XINT (chars
) != 94 && XINT (chars
) != 96)
755 error ("Invalid charset chars %d, it should be 94 or 96",
757 for (final_char
= '0'; final_char
<= '?'; final_char
++)
759 if (ISO_CHARSET_TABLE (dimension
, chars
, make_number (final_char
)) < 0)
762 return (final_char
<= '?' ? make_number (final_char
) : Qnil
);
765 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset
, Sdeclare_equiv_charset
,
767 doc
: /* Declare an equivalent charset for ISO-2022 decoding.
769 On decoding by an ISO-2022 base coding system, when a charset
770 specified by DIMENSION, CHARS, and FINAL-CHAR is designated, behave as
771 if CHARSET is designated instead. */)
772 (dimension
, chars
, final_char
, charset
)
773 Lisp_Object dimension
, chars
, final_char
, charset
;
777 CHECK_NUMBER (dimension
);
778 CHECK_NUMBER (chars
);
779 CHECK_NUMBER (final_char
);
780 CHECK_SYMBOL (charset
);
782 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
783 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension
));
784 if (XINT (chars
) != 94 && XINT (chars
) != 96)
785 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars
));
786 if (XINT (final_char
) < '0' || XFASTINT (final_char
) > '~')
787 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars
));
788 if ((charset_id
= get_charset_id (charset
)) < 0)
789 error ("Invalid charset %s", SDATA (SYMBOL_NAME (charset
)));
791 ISO_CHARSET_TABLE (dimension
, chars
, final_char
) = charset_id
;
795 /* Return information about charsets in the text at PTR of NBYTES
796 bytes, which are NCHARS characters. The value is:
798 0: Each character is represented by one byte. This is always
799 true for unibyte text.
800 1: No charsets other than ascii eight-bit-control,
801 eight-bit-graphic, and latin-1 are found.
804 In addition, if CHARSETS is nonzero, for each found charset N, set
805 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
806 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
807 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
808 1 (note that there's no charset whose ID is 1). */
811 find_charset_in_text (ptr
, nchars
, nbytes
, charsets
, table
)
812 const unsigned char *ptr
;
813 int nchars
, nbytes
, *charsets
;
816 if (nchars
== nbytes
)
818 if (charsets
&& nbytes
> 0)
820 const unsigned char *endp
= ptr
+ nbytes
;
823 while (ptr
< endp
&& maskbits
!= 7)
825 maskbits
|= (*ptr
< 0x80 ? 1 : *ptr
< 0xA0 ? 2 : 4);
830 charsets
[CHARSET_ASCII
] = 1;
832 charsets
[CHARSET_8_BIT_CONTROL
] = 1;
834 charsets
[CHARSET_8_BIT_GRAPHIC
] = 1;
841 int bytes
, charset
, c1
, c2
;
843 if (! CHAR_TABLE_P (table
))
848 SPLIT_MULTIBYTE_SEQ (ptr
, len
, bytes
, charset
, c1
, c2
);
851 if (!CHARSET_DEFINED_P (charset
))
853 else if (! NILP (table
))
855 int c
= translate_char (table
, -1, charset
, c1
, c2
);
857 charset
= CHAR_CHARSET (c
);
861 && charset
!= CHARSET_ASCII
862 && charset
!= CHARSET_8_BIT_CONTROL
863 && charset
!= CHARSET_8_BIT_GRAPHIC
864 && charset
!= charset_latin_iso8859_1
)
868 charsets
[charset
] = 1;
869 else if (return_val
== 2)
876 DEFUN ("find-charset-region", Ffind_charset_region
, Sfind_charset_region
,
878 doc
: /* Return a list of charsets in the region between BEG and END.
879 BEG and END are buffer positions.
880 Optional arg TABLE if non-nil is a translation table to look up.
882 If the region contains invalid multibyte characters,
883 `unknown' is included in the returned list.
885 If the current buffer is unibyte, the returned list may contain
886 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
888 Lisp_Object beg
, end
, table
;
890 int charsets
[MAX_CHARSET
+ 1];
891 int from
, from_byte
, to
, stop
, stop_byte
, i
;
894 validate_region (&beg
, &end
);
895 from
= XFASTINT (beg
);
896 stop
= to
= XFASTINT (end
);
898 if (from
< GPT
&& GPT
< to
)
901 stop_byte
= GPT_BYTE
;
904 stop_byte
= CHAR_TO_BYTE (stop
);
906 from_byte
= CHAR_TO_BYTE (from
);
908 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
911 find_charset_in_text (BYTE_POS_ADDR (from_byte
), stop
- from
,
912 stop_byte
- from_byte
, charsets
, table
);
915 from
= stop
, from_byte
= stop_byte
;
916 stop
= to
, stop_byte
= CHAR_TO_BYTE (stop
);
924 val
= Fcons (Qunknown
, val
);
925 for (i
= MAX_CHARSET
; i
>= MIN_CHARSET_OFFICIAL_DIMENSION1
; i
--)
927 val
= Fcons (CHARSET_SYMBOL (i
), val
);
929 val
= Fcons (Qascii
, val
);
933 DEFUN ("find-charset-string", Ffind_charset_string
, Sfind_charset_string
,
935 doc
: /* Return a list of charsets in STR.
936 Optional arg TABLE if non-nil is a translation table to look up.
938 If the string contains invalid multibyte characters,
939 `unknown' is included in the returned list.
941 If STR is unibyte, the returned list may contain
942 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
944 Lisp_Object str
, table
;
946 int charsets
[MAX_CHARSET
+ 1];
952 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
953 find_charset_in_text (SDATA (str
), SCHARS (str
),
954 SBYTES (str
), charsets
, table
);
958 val
= Fcons (Qunknown
, val
);
959 for (i
= MAX_CHARSET
; i
>= MIN_CHARSET_OFFICIAL_DIMENSION1
; i
--)
961 val
= Fcons (CHARSET_SYMBOL (i
), val
);
963 val
= Fcons (Qascii
, val
);
968 DEFUN ("make-char-internal", Fmake_char_internal
, Smake_char_internal
, 1, 3, 0,
969 doc
: /* Return a character made from arguments.
970 Internal use only. */)
971 (charset
, code1
, code2
)
972 Lisp_Object charset
, code1
, code2
;
974 int charset_id
, c1
, c2
;
976 CHECK_NUMBER (charset
);
977 charset_id
= XINT (charset
);
978 if (!CHARSET_DEFINED_P (charset_id
))
979 error ("Invalid charset ID: %d", XINT (charset
));
985 CHECK_NUMBER (code1
);
992 CHECK_NUMBER (code2
);
996 if (charset_id
== CHARSET_ASCII
)
998 if (c1
< 0 || c1
> 0x7F)
999 goto invalid_code_posints
;
1000 return make_number (c1
);
1002 else if (charset_id
== CHARSET_8_BIT_CONTROL
)
1006 else if (c1
< 0x80 || c1
> 0x9F)
1007 goto invalid_code_posints
;
1008 return make_number (c1
);
1010 else if (charset_id
== CHARSET_8_BIT_GRAPHIC
)
1014 else if (c1
< 0xA0 || c1
> 0xFF)
1015 goto invalid_code_posints
;
1016 return make_number (c1
);
1018 else if (c1
< 0 || c1
> 0xFF || c2
< 0 || c2
> 0xFF)
1019 goto invalid_code_posints
;
1025 ? !CHAR_COMPONENTS_VALID_P (charset_id
, c1
, 0x20)
1026 : !CHAR_COMPONENTS_VALID_P (charset_id
, c1
, c2
)))
1027 goto invalid_code_posints
;
1028 return make_number (MAKE_CHAR (charset_id
, c1
, c2
));
1030 invalid_code_posints
:
1031 error ("Invalid code points for charset ID %d: %d %d", charset_id
, c1
, c2
);
1034 DEFUN ("split-char", Fsplit_char
, Ssplit_char
, 1, 1, 0,
1035 doc
: /* Return list of charset and one or two position-codes of CH.
1036 If CH is invalid as a character code,
1037 return a list of symbol `unknown' and CH. */)
1041 int c
, charset
, c1
, c2
;
1045 if (!CHAR_VALID_P (c
, 1))
1046 return Fcons (Qunknown
, Fcons (ch
, Qnil
));
1047 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
1049 ? Fcons (CHARSET_SYMBOL (charset
),
1050 Fcons (make_number (c1
), Fcons (make_number (c2
), Qnil
)))
1051 : Fcons (CHARSET_SYMBOL (charset
), Fcons (make_number (c1
), Qnil
)));
1054 DEFUN ("char-charset", Fchar_charset
, Schar_charset
, 1, 1, 0,
1055 doc
: /* Return charset of CH. */)
1061 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch
)));
1064 DEFUN ("charset-after", Fcharset_after
, Scharset_after
, 0, 1, 0,
1065 doc
: /* Return charset of a character in the current buffer at position POS.
1066 If POS is nil, it defauls to the current point.
1067 If POS is out of range, the value is nil. */)
1074 ch
= Fchar_after (pos
);
1075 if (! INTEGERP (ch
))
1077 charset
= CHAR_CHARSET (XINT (ch
));
1078 return CHARSET_SYMBOL (charset
);
1081 DEFUN ("iso-charset", Fiso_charset
, Siso_charset
, 3, 3, 0,
1082 doc
: /* Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.
1084 ISO 2022's designation sequence (escape sequence) distinguishes charsets
1085 by their DIMENSION, CHARS, and FINAL-CHAR,
1086 where as Emacs distinguishes them by charset symbol.
1087 See the documentation of the function `charset-info' for the meanings of
1088 DIMENSION, CHARS, and FINAL-CHAR. */)
1089 (dimension
, chars
, final_char
)
1090 Lisp_Object dimension
, chars
, final_char
;
1094 CHECK_NUMBER (dimension
);
1095 CHECK_NUMBER (chars
);
1096 CHECK_NUMBER (final_char
);
1098 if ((charset
= ISO_CHARSET_TABLE (dimension
, chars
, final_char
)) < 0)
1100 return CHARSET_SYMBOL (charset
);
1103 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1104 generic character. If GENERICP is zero, return nonzero iff C is a
1105 valid normal character. Do not call this function directly,
1106 instead use macro CHAR_VALID_P. */
1108 char_valid_p (c
, genericp
)
1111 int charset
, c1
, c2
;
1113 if (c
< 0 || c
>= MAX_CHAR
)
1115 if (SINGLE_BYTE_CHAR_P (c
))
1117 SPLIT_CHAR (c
, charset
, c1
, c2
);
1122 if (c2
<= 0) c2
= 0x20;
1126 if (c2
<= 0) c1
= c2
= 0x20;
1129 return (CHARSET_DEFINED_P (charset
)
1130 && CHAR_COMPONENTS_VALID_P (charset
, c1
, c2
));
1133 DEFUN ("char-valid-p", Fchar_valid_p
, Schar_valid_p
, 1, 2, 0,
1134 doc
: /* Return t if OBJECT is a valid normal character.
1135 If optional arg GENERICP is non-nil, also return t if OBJECT is
1136 a valid generic character. */)
1138 Lisp_Object object
, genericp
;
1140 if (! NATNUMP (object
))
1142 return (CHAR_VALID_P (XFASTINT (object
), !NILP (genericp
)) ? Qt
: Qnil
);
1145 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
1146 Sunibyte_char_to_multibyte
, 1, 1, 0,
1147 doc
: /* Convert the unibyte character CH to multibyte character.
1148 The conversion is done based on `nonascii-translation-table' (which see)
1149 or `nonascii-insert-offset' (which see). */)
1157 if (c
< 0 || c
>= 0400)
1158 error ("Invalid unibyte character: %d", c
);
1159 c
= unibyte_char_to_multibyte (c
);
1161 error ("Can't convert to multibyte character: %d", XINT (ch
));
1162 return make_number (c
);
1165 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte
,
1166 Smultibyte_char_to_unibyte
, 1, 1, 0,
1167 doc
: /* Convert the multibyte character CH to unibyte character.
1168 The conversion is done based on `nonascii-translation-table' (which see)
1169 or `nonascii-insert-offset' (which see). */)
1177 if (! CHAR_VALID_P (c
, 0))
1178 error ("Invalid multibyte character: %d", c
);
1179 c
= multibyte_char_to_unibyte (c
, Qnil
);
1181 error ("Can't convert to unibyte character: %d", XINT (ch
));
1182 return make_number (c
);
1185 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
1186 doc
: /* Return 1 regardless of the argument CH. */)
1191 return make_number (1);
1194 /* Return how many bytes C will occupy in a multibyte buffer.
1195 Don't call this function directly, instead use macro CHAR_BYTES. */
1202 if (ASCII_BYTE_P (c
) || (c
& ~((1 << CHARACTERBITS
) -1)))
1204 if (SINGLE_BYTE_CHAR_P (c
) && c
>= 0xA0)
1207 charset
= CHAR_CHARSET (c
);
1208 return (CHARSET_DEFINED_P (charset
) ? CHARSET_BYTES (charset
) : 1);
1211 /* Return the width of character of which multi-byte form starts with
1212 C. The width is measured by how many columns occupied on the
1213 screen when displayed in the current buffer. */
1215 #define ONE_BYTE_CHAR_WIDTH(c) \
1218 ? XFASTINT (current_buffer->tab_width) \
1219 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1223 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1224 : ((! NILP (current_buffer->enable_multibyte_characters) \
1225 && BASE_LEADING_CODE_P (c)) \
1226 ? WIDTH_BY_CHAR_HEAD (c) \
1229 DEFUN ("char-width", Fchar_width
, Schar_width
, 1, 1, 0,
1230 doc
: /* Return width of CH when displayed in the current buffer.
1231 The width is measured by how many columns it occupies on the screen.
1232 Tab is taken to occupy `tab-width' columns. */)
1236 Lisp_Object val
, disp
;
1238 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1244 /* Get the way the display table would display it. */
1245 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
1248 XSETINT (val
, XVECTOR (disp
)->size
);
1249 else if (SINGLE_BYTE_CHAR_P (c
))
1250 XSETINT (val
, ONE_BYTE_CHAR_WIDTH (c
));
1253 int charset
= CHAR_CHARSET (c
);
1255 XSETFASTINT (val
, CHARSET_WIDTH (charset
));
1260 /* Return width of string STR of length LEN when displayed in the
1261 current buffer. The width is measured by how many columns it
1262 occupies on the screen. */
1269 return c_string_width (str
, len
, -1, NULL
, NULL
);
1272 /* Return width of string STR of length LEN when displayed in the
1273 current buffer. The width is measured by how many columns it
1274 occupies on the screen. If PRECISION > 0, return the width of
1275 longest substring that doesn't exceed PRECISION, and set number of
1276 characters and bytes of the substring in *NCHARS and *NBYTES
1280 c_string_width (str
, len
, precision
, nchars
, nbytes
)
1281 const unsigned char *str
;
1282 int len
, precision
, *nchars
, *nbytes
;
1284 int i
= 0, i_byte
= 0;
1287 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1289 while (i_byte
< len
)
1291 int bytes
, thiswidth
;
1296 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
1299 val
= DISP_CHAR_VECTOR (dp
, c
);
1301 thiswidth
= XVECTOR (val
)->size
;
1303 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1308 PARSE_MULTIBYTE_SEQ (str
+ i_byte
, len
- i_byte
, bytes
);
1309 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1313 && (width
+ thiswidth
> precision
))
1333 /* Return width of Lisp string STRING when displayed in the current
1334 buffer. The width is measured by how many columns it occupies on
1335 the screen while paying attention to compositions. If PRECISION >
1336 0, return the width of longest substring that doesn't exceed
1337 PRECISION, and set number of characters and bytes of the substring
1338 in *NCHARS and *NBYTES respectively. */
1341 lisp_string_width (string
, precision
, nchars
, nbytes
)
1343 int precision
, *nchars
, *nbytes
;
1345 int len
= SCHARS (string
);
1346 int len_byte
= SBYTES (string
);
1347 /* This set multibyte to 0 even if STRING is multibyte when it
1348 contains only ascii and eight-bit-graphic, but that's
1350 int multibyte
= len
< len_byte
;
1351 const unsigned char *str
= SDATA (string
);
1352 int i
= 0, i_byte
= 0;
1354 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1358 int chars
, bytes
, thiswidth
;
1363 if (find_composition (i
, -1, &ignore
, &end
, &val
, string
)
1364 && ((cmp_id
= get_composition_id (i
, i_byte
, end
- i
, val
, string
))
1367 thiswidth
= composition_table
[cmp_id
]->width
;
1369 bytes
= string_char_to_byte (string
, end
) - i_byte
;
1376 c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
1378 c
= str
[i_byte
], bytes
= 1;
1380 val
= DISP_CHAR_VECTOR (dp
, c
);
1382 thiswidth
= XVECTOR (val
)->size
;
1384 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1390 PARSE_MULTIBYTE_SEQ (str
+ i_byte
, len_byte
- i_byte
, bytes
);
1393 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1397 && (width
+ thiswidth
> precision
))
1417 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
1418 doc
: /* Return width of STRING when displayed in the current buffer.
1419 Width is measured by how many columns it occupies on the screen.
1420 When calculating width of a multibyte character in STRING,
1421 only the base leading-code is considered; the validity of
1422 the following bytes is not checked. Tabs in STRING are always
1423 taken to occupy `tab-width' columns. */)
1429 CHECK_STRING (string
);
1430 XSETFASTINT (val
, lisp_string_width (string
, -1, NULL
, NULL
));
1434 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
1435 doc
: /* Return the direction of CH.
1436 The returned value is 0 for left-to-right and 1 for right-to-left. */)
1443 charset
= CHAR_CHARSET (XFASTINT (ch
));
1444 if (!CHARSET_DEFINED_P (charset
))
1445 invalid_character (XINT (ch
));
1446 return CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
);
1449 /* Return the number of characters in the NBYTES bytes at PTR.
1450 This works by looking at the contents and checking for multibyte sequences.
1451 However, if the current buffer has enable-multibyte-characters = nil,
1452 we treat each byte as a character. */
1455 chars_in_text (ptr
, nbytes
)
1456 const unsigned char *ptr
;
1459 /* current_buffer is null at early stages of Emacs initialization. */
1460 if (current_buffer
== 0
1461 || NILP (current_buffer
->enable_multibyte_characters
))
1464 return multibyte_chars_in_text (ptr
, nbytes
);
1467 /* Return the number of characters in the NBYTES bytes at PTR.
1468 This works by looking at the contents and checking for multibyte sequences.
1469 It ignores enable-multibyte-characters. */
1472 multibyte_chars_in_text (ptr
, nbytes
)
1473 const unsigned char *ptr
;
1476 const unsigned char *endp
;
1479 endp
= ptr
+ nbytes
;
1484 PARSE_MULTIBYTE_SEQ (ptr
, endp
- ptr
, bytes
);
1492 /* Parse unibyte text at STR of LEN bytes as multibyte text, and
1493 count the numbers of characters and bytes in it. On counting
1494 bytes, pay attention to the fact that 8-bit characters in the range
1495 0x80..0x9F are represented by 2 bytes in multibyte text. */
1497 parse_str_as_multibyte (str
, len
, nchars
, nbytes
)
1498 const unsigned char *str
;
1499 int len
, *nchars
, *nbytes
;
1501 const unsigned char *endp
= str
+ len
;
1502 int n
, chars
= 0, bytes
= 0;
1506 if (UNIBYTE_STR_AS_MULTIBYTE_P (str
, endp
- str
, n
))
1507 str
+= n
, bytes
+= n
;
1517 /* Arrange unibyte text at STR of NBYTES bytes as multibyte text.
1518 It actually converts only 8-bit characters in the range 0x80..0x9F
1519 that don't contruct multibyte characters to multibyte forms. If
1520 NCHARS is nonzero, set *NCHARS to the number of characters in the
1521 text. It is assured that we can use LEN bytes at STR as a work
1522 area and that is enough. Return the number of bytes of the
1526 str_as_multibyte (str
, len
, nbytes
, nchars
)
1528 int len
, nbytes
, *nchars
;
1530 unsigned char *p
= str
, *endp
= str
+ nbytes
;
1535 while (p
< endp
&& UNIBYTE_STR_AS_MULTIBYTE_P (p
, endp
- p
, n
))
1545 safe_bcopy (p
, endp
- nbytes
, nbytes
);
1549 if (UNIBYTE_STR_AS_MULTIBYTE_P (p
, endp
- p
, n
))
1556 *to
++ = LEADING_CODE_8_BIT_CONTROL
;
1557 *to
++ = *p
++ + 0x20;
1566 /* Parse unibyte string at STR of LEN bytes, and return the number of
1567 bytes it may ocupy when converted to multibyte string by
1568 `str_to_multibyte'. */
1571 parse_str_to_multibyte (str
, len
)
1575 unsigned char *endp
= str
+ len
;
1578 for (bytes
= 0; str
< endp
; str
++)
1579 bytes
+= (*str
< 0x80 || *str
>= 0xA0) ? 1 : 2;
1583 /* Convert unibyte text at STR of NBYTES bytes to multibyte text
1584 that contains the same single-byte characters. It actually
1585 converts all 8-bit characters to multibyte forms. It is assured
1586 that we can use LEN bytes at STR as a work area and that is
1590 str_to_multibyte (str
, len
, bytes
)
1594 unsigned char *p
= str
, *endp
= str
+ bytes
;
1597 while (p
< endp
&& (*p
< 0x80 || *p
>= 0xA0)) p
++;
1603 safe_bcopy (p
, endp
- bytes
, bytes
);
1607 if (*p
< 0x80 || *p
>= 0xA0)
1610 *to
++ = LEADING_CODE_8_BIT_CONTROL
, *to
++ = *p
++ + 0x20;
1615 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1616 actually converts only 8-bit characters in the range 0x80..0x9F to
1620 str_as_unibyte (str
, bytes
)
1624 unsigned char *p
= str
, *endp
= str
+ bytes
;
1625 unsigned char *to
= str
;
1627 while (p
< endp
&& *p
!= LEADING_CODE_8_BIT_CONTROL
) p
++;
1631 if (*p
== LEADING_CODE_8_BIT_CONTROL
)
1632 *to
++ = *(p
+ 1) - 0x20, p
+= 2;
1640 DEFUN ("string", Fstring
, Sstring
, 0, MANY
, 0,
1641 doc
: /* Concatenate all the argument characters and make the result a string.
1642 usage: (string &rest CHARACTERS) */)
1648 unsigned char *buf
, *p
;
1654 bufsize
= MAX_MULTIBYTE_LENGTH
* n
;
1655 SAFE_ALLOCA (buf
, unsigned char *, bufsize
);
1658 for (i
= 0; i
< n
; i
++)
1660 CHECK_NUMBER (args
[i
]);
1661 if (!multibyte
&& !SINGLE_BYTE_CHAR_P (XFASTINT (args
[i
])))
1665 for (i
= 0; i
< n
; i
++)
1669 p
+= CHAR_STRING (c
, p
);
1674 ret
= make_string_from_bytes (buf
, n
, p
- buf
);
1683 charset_id_internal (charset_name
)
1688 val
= Fget (intern (charset_name
), Qcharset
);
1690 error ("Charset %s is not defined", charset_name
);
1692 return (XINT (XVECTOR (val
)->contents
[0]));
1695 DEFUN ("setup-special-charsets", Fsetup_special_charsets
,
1696 Ssetup_special_charsets
, 0, 0, 0, doc
: /* Internal use only. */)
1699 charset_latin_iso8859_1
= charset_id_internal ("latin-iso8859-1");
1700 charset_jisx0208_1978
= charset_id_internal ("japanese-jisx0208-1978");
1701 charset_jisx0208
= charset_id_internal ("japanese-jisx0208");
1702 charset_katakana_jisx0201
= charset_id_internal ("katakana-jisx0201");
1703 charset_latin_jisx0201
= charset_id_internal ("latin-jisx0201");
1704 charset_big5_1
= charset_id_internal ("chinese-big5-1");
1705 charset_big5_2
= charset_id_internal ("chinese-big5-2");
1706 charset_mule_unicode_0100_24ff
1707 = charset_id_internal ("mule-unicode-0100-24ff");
1708 charset_mule_unicode_2500_33ff
1709 = charset_id_internal ("mule-unicode-2500-33ff");
1710 charset_mule_unicode_e000_ffff
1711 = charset_id_internal ("mule-unicode-e000-ffff");
1716 init_charset_once ()
1720 staticpro (&Vcharset_table
);
1721 staticpro (&Vcharset_symbol_table
);
1722 staticpro (&Vgeneric_character_list
);
1724 /* This has to be done here, before we call Fmake_char_table. */
1725 Qcharset_table
= intern ("charset-table");
1726 staticpro (&Qcharset_table
);
1728 /* Intern this now in case it isn't already done.
1729 Setting this variable twice is harmless.
1730 But don't staticpro it here--that is done in alloc.c. */
1731 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
1733 /* Now we are ready to set up this property, so we can
1734 create the charset table. */
1735 Fput (Qcharset_table
, Qchar_table_extra_slots
, make_number (0));
1736 Vcharset_table
= Fmake_char_table (Qcharset_table
, Qnil
);
1738 Qunknown
= intern ("unknown");
1739 staticpro (&Qunknown
);
1740 Vcharset_symbol_table
= Fmake_vector (make_number (MAX_CHARSET
+ 1),
1744 for (i
= 0; i
< 2; i
++)
1745 for (j
= 0; j
< 2; j
++)
1746 for (k
= 0; k
< 128; k
++)
1747 iso_charset_table
[i
][j
][k
] = -1;
1749 for (i
= 0; i
< 256; i
++)
1750 bytes_by_char_head
[i
] = 1;
1751 bytes_by_char_head
[LEADING_CODE_PRIVATE_11
] = 3;
1752 bytes_by_char_head
[LEADING_CODE_PRIVATE_12
] = 3;
1753 bytes_by_char_head
[LEADING_CODE_PRIVATE_21
] = 4;
1754 bytes_by_char_head
[LEADING_CODE_PRIVATE_22
] = 4;
1756 for (i
= 0; i
< 128; i
++)
1757 width_by_char_head
[i
] = 1;
1758 for (; i
< 256; i
++)
1759 width_by_char_head
[i
] = 4;
1760 width_by_char_head
[LEADING_CODE_PRIVATE_11
] = 1;
1761 width_by_char_head
[LEADING_CODE_PRIVATE_12
] = 2;
1762 width_by_char_head
[LEADING_CODE_PRIVATE_21
] = 1;
1763 width_by_char_head
[LEADING_CODE_PRIVATE_22
] = 2;
1769 for (i
= 0x81; i
< 0x90; i
++)
1770 val
= Fcons (make_number ((i
- 0x70) << 7), val
);
1771 for (; i
< 0x9A; i
++)
1772 val
= Fcons (make_number ((i
- 0x8F) << 14), val
);
1773 for (i
= 0xA0; i
< 0xF0; i
++)
1774 val
= Fcons (make_number ((i
- 0x70) << 7), val
);
1775 for (; i
< 0xFF; i
++)
1776 val
= Fcons (make_number ((i
- 0xE0) << 14), val
);
1777 Vgeneric_character_list
= Fnreverse (val
);
1780 nonascii_insert_offset
= 0;
1781 Vnonascii_translation_table
= Qnil
;
1789 Qcharset
= intern ("charset");
1790 staticpro (&Qcharset
);
1792 Qascii
= intern ("ascii");
1793 staticpro (&Qascii
);
1795 Qeight_bit_control
= intern ("eight-bit-control");
1796 staticpro (&Qeight_bit_control
);
1798 Qeight_bit_graphic
= intern ("eight-bit-graphic");
1799 staticpro (&Qeight_bit_graphic
);
1801 /* Define special charsets ascii, eight-bit-control, and
1802 eight-bit-graphic. */
1803 update_charset_table (make_number (CHARSET_ASCII
),
1804 make_number (1), make_number (94),
1809 build_string ("ASCII"),
1810 Qnil
, /* same as above */
1811 build_string ("ASCII (ISO646 IRV)"));
1812 CHARSET_SYMBOL (CHARSET_ASCII
) = Qascii
;
1813 Fput (Qascii
, Qcharset
, CHARSET_TABLE_ENTRY (CHARSET_ASCII
));
1815 update_charset_table (make_number (CHARSET_8_BIT_CONTROL
),
1816 make_number (1), make_number (96),
1821 build_string ("8-bit control code (0x80..0x9F)"),
1822 Qnil
, /* same as above */
1823 Qnil
); /* same as above */
1824 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL
) = Qeight_bit_control
;
1825 Fput (Qeight_bit_control
, Qcharset
,
1826 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL
));
1828 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC
),
1829 make_number (1), make_number (96),
1834 build_string ("8-bit graphic char (0xA0..0xFF)"),
1835 Qnil
, /* same as above */
1836 Qnil
); /* same as above */
1837 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC
) = Qeight_bit_graphic
;
1838 Fput (Qeight_bit_graphic
, Qcharset
,
1839 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC
));
1841 Qauto_fill_chars
= intern ("auto-fill-chars");
1842 staticpro (&Qauto_fill_chars
);
1843 Fput (Qauto_fill_chars
, Qchar_table_extra_slots
, make_number (0));
1845 defsubr (&Sdefine_charset
);
1846 defsubr (&Sgeneric_character_list
);
1847 defsubr (&Sget_unused_iso_final_char
);
1848 defsubr (&Sdeclare_equiv_charset
);
1849 defsubr (&Sfind_charset_region
);
1850 defsubr (&Sfind_charset_string
);
1851 defsubr (&Smake_char_internal
);
1852 defsubr (&Ssplit_char
);
1853 defsubr (&Schar_charset
);
1854 defsubr (&Scharset_after
);
1855 defsubr (&Siso_charset
);
1856 defsubr (&Schar_valid_p
);
1857 defsubr (&Sunibyte_char_to_multibyte
);
1858 defsubr (&Smultibyte_char_to_unibyte
);
1859 defsubr (&Schar_bytes
);
1860 defsubr (&Schar_width
);
1861 defsubr (&Sstring_width
);
1862 defsubr (&Schar_direction
);
1864 defsubr (&Ssetup_special_charsets
);
1866 DEFVAR_LISP ("charset-list", &Vcharset_list
,
1867 doc
: /* List of charsets ever defined. */);
1868 Vcharset_list
= Fcons (Qascii
, Fcons (Qeight_bit_control
,
1869 Fcons (Qeight_bit_graphic
, Qnil
)));
1871 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector
,
1872 doc
: /* Vector of cons cell of a symbol and translation table ever defined.
1873 An ID of a translation table is an index of this vector. */);
1874 Vtranslation_table_vector
= Fmake_vector (make_number (16), Qnil
);
1876 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11
,
1877 doc
: /* Leading-code of private TYPE9N charset of column-width 1. */);
1878 leading_code_private_11
= LEADING_CODE_PRIVATE_11
;
1880 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12
,
1881 doc
: /* Leading-code of private TYPE9N charset of column-width 2. */);
1882 leading_code_private_12
= LEADING_CODE_PRIVATE_12
;
1884 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21
,
1885 doc
: /* Leading-code of private TYPE9Nx9N charset of column-width 1. */);
1886 leading_code_private_21
= LEADING_CODE_PRIVATE_21
;
1888 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22
,
1889 doc
: /* Leading-code of private TYPE9Nx9N charset of column-width 2. */);
1890 leading_code_private_22
= LEADING_CODE_PRIVATE_22
;
1892 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset
,
1893 doc
: /* Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.
1894 This is used for converting unibyte text to multibyte,
1895 and for inserting character codes specified by number.
1897 This serves to convert a Latin-1 or similar 8-bit character code
1898 to the corresponding Emacs multibyte character code.
1899 Typically the value should be (- (make-char CHARSET 0) 128),
1900 for your choice of character set.
1901 If `nonascii-translation-table' is non-nil, it overrides this variable. */);
1902 nonascii_insert_offset
= 0;
1904 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table
,
1905 doc
: /* Translation table to convert non-ASCII unibyte codes to multibyte.
1906 This is used for converting unibyte text to multibyte,
1907 and for inserting character codes specified by number.
1909 Conversion is performed only when multibyte characters are enabled,
1910 and it serves to convert a Latin-1 or similar 8-bit character code
1911 to the corresponding Emacs character code.
1913 If this is nil, `nonascii-insert-offset' is used instead.
1914 See also the docstring of `make-translation-table'. */);
1915 Vnonascii_translation_table
= Qnil
;
1917 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars
,
1918 doc
: /* A char-table for characters which invoke auto-filling.
1919 Such characters have value t in this table. */);
1920 Vauto_fill_chars
= Fmake_char_table (Qauto_fill_chars
, Qnil
);
1921 CHAR_TABLE_SET (Vauto_fill_chars
, make_number (' '), Qt
);
1922 CHAR_TABLE_SET (Vauto_fill_chars
, make_number ('\n'), Qt
);
1927 /* arch-tag: 66a89b8d-4c28-47d3-9ca1-56f78440d69f
1928 (do not change this comment) */