1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
6 This file is part of GNU Emacs.
8 GNU Emacs is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
13 GNU Emacs is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GNU Emacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 Boston, MA 02110-1301, USA. */
23 /* At first, see the document in `charset.h' to understand the code in
34 #include <sys/types.h>
38 #include "composite.h"
48 Lisp_Object Qcharset
, Qascii
, Qeight_bit_control
, Qeight_bit_graphic
;
51 /* Declaration of special leading-codes. */
52 EMACS_INT leading_code_private_11
; /* for private DIMENSION1 of 1-column */
53 EMACS_INT leading_code_private_12
; /* for private DIMENSION1 of 2-column */
54 EMACS_INT leading_code_private_21
; /* for private DIMENSION2 of 1-column */
55 EMACS_INT leading_code_private_22
; /* for private DIMENSION2 of 2-column */
57 /* Declaration of special charsets. The values are set by
58 Fsetup_special_charsets. */
59 int charset_latin_iso8859_1
; /* ISO8859-1 (Latin-1) */
60 int charset_jisx0208_1978
; /* JISX0208.1978 (Japanese Kanji old set) */
61 int charset_jisx0208
; /* JISX0208.1983 (Japanese Kanji) */
62 int charset_katakana_jisx0201
; /* JISX0201.Kana (Japanese Katakana) */
63 int charset_latin_jisx0201
; /* JISX0201.Roman (Japanese Roman) */
64 int charset_big5_1
; /* Big5 Level 1 (Chinese Traditional) */
65 int charset_big5_2
; /* Big5 Level 2 (Chinese Traditional) */
66 int charset_mule_unicode_0100_24ff
;
67 int charset_mule_unicode_2500_33ff
;
68 int charset_mule_unicode_e000_ffff
;
70 Lisp_Object Qcharset_table
;
72 /* A char-table containing information of each character set. */
73 Lisp_Object Vcharset_table
;
75 /* A vector of charset symbol indexed by charset-id. This is used
76 only for returning charset symbol from C functions. */
77 Lisp_Object Vcharset_symbol_table
;
79 /* A list of charset symbols ever defined. */
80 Lisp_Object Vcharset_list
;
82 /* Vector of translation table ever defined.
83 ID of a translation table is used to index this vector. */
84 Lisp_Object Vtranslation_table_vector
;
86 /* A char-table for characters which may invoke auto-filling. */
87 Lisp_Object Vauto_fill_chars
;
89 Lisp_Object Qauto_fill_chars
;
91 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
92 int bytes_by_char_head
[256];
93 int width_by_char_head
[256];
95 /* Mapping table from ISO2022's charset (specified by DIMENSION,
96 CHARS, and FINAL-CHAR) to Emacs' charset. */
97 int iso_charset_table
[2][2][128];
99 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
100 unsigned char *_fetch_multibyte_char_p
;
101 int _fetch_multibyte_char_len
;
103 /* Offset to add to a non-ASCII value when inserting it. */
104 EMACS_INT nonascii_insert_offset
;
106 /* Translation table for converting non-ASCII unibyte characters
107 to multibyte codes, or nil. */
108 Lisp_Object Vnonascii_translation_table
;
110 /* List of all possible generic characters. */
111 Lisp_Object Vgeneric_character_list
;
115 invalid_character (c
)
118 error ("Invalid character: 0%o, %d, 0x%x", c
, c
, c
);
121 /* Parse string STR of length LENGTH and fetch information of a
122 character at STR. Set BYTES to the byte length the character
123 occupies, CHARSET, C1, C2 to proper values of the character. */
125 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
128 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
130 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
131 else if ((bytes) == 2) \
133 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
134 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
136 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
138 else if ((bytes) == 3) \
140 if ((c1) < LEADING_CODE_PRIVATE_11) \
141 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
143 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
146 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
149 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.
150 Note that this intentionally allows invalid components, such
151 as 0xA0 0xA0, because there exist many files that contain
152 such invalid byte sequences, especially in EUC-GB. */
153 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
154 ((charset) == CHARSET_ASCII \
155 ? ((c1) >= 0 && (c1) <= 0x7F) \
156 : ((charset) == CHARSET_8_BIT_CONTROL \
157 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
158 : ((charset) == CHARSET_8_BIT_GRAPHIC \
159 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
160 : (CHARSET_DIMENSION (charset) == 1 \
161 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
162 : ((c1) >= 0x20 && (c1) <= 0x7F \
163 && (c2) >= 0x20 && (c2) <= 0x7F)))))
165 /* Store multi-byte form of the character C in STR. The caller should
166 allocate at least 4-byte area at STR in advance. Returns the
167 length of the multi-byte form. If C is an invalid character code,
171 char_to_string_1 (c
, str
)
175 unsigned char *p
= str
;
177 if (c
& CHAR_MODIFIER_MASK
) /* This includes the case C is negative. */
179 /* Multibyte character can't have a modifier bit. */
180 if (! SINGLE_BYTE_CHAR_P ((c
& ~CHAR_MODIFIER_MASK
)))
183 /* For Meta, Shift, and Control modifiers, we need special care. */
186 /* Move the meta bit to the right place for a string. */
187 c
= (c
& ~CHAR_META
) | 0x80;
191 /* Shift modifier is valid only with [A-Za-z]. */
192 if ((c
& 0377) >= 'A' && (c
& 0377) <= 'Z')
194 else if ((c
& 0377) >= 'a' && (c
& 0377) <= 'z')
195 c
= (c
& ~CHAR_SHIFT
) - ('a' - 'A');
199 /* Simulate the code in lread.c. */
200 /* Allow `\C- ' and `\C-?'. */
201 if (c
== (CHAR_CTL
| ' '))
203 else if (c
== (CHAR_CTL
| '?'))
205 /* ASCII control chars are made from letters (both cases),
206 as well as the non-letters within 0100...0137. */
207 else if ((c
& 0137) >= 0101 && (c
& 0137) <= 0132)
208 c
&= (037 | (~0177 & ~CHAR_CTL
));
209 else if ((c
& 0177) >= 0100 && (c
& 0177) <= 0137)
210 c
&= (037 | (~0177 & ~CHAR_CTL
));
213 /* If C still has any modifier bits, just ignore it. */
214 c
&= ~CHAR_MODIFIER_MASK
;
217 if (SINGLE_BYTE_CHAR_P (c
))
219 if (ASCII_BYTE_P (c
) || c
>= 0xA0)
223 *p
++ = LEADING_CODE_8_BIT_CONTROL
;
227 else if (CHAR_VALID_P (c
, 0))
231 SPLIT_CHAR (c
, charset
, c1
, c2
);
233 if (charset
>= LEADING_CODE_EXT_11
)
234 *p
++ = (charset
< LEADING_CODE_EXT_12
235 ? LEADING_CODE_PRIVATE_11
236 : (charset
< LEADING_CODE_EXT_21
237 ? LEADING_CODE_PRIVATE_12
238 : (charset
< LEADING_CODE_EXT_22
239 ? LEADING_CODE_PRIVATE_21
240 : LEADING_CODE_PRIVATE_22
)));
242 if ((c1
> 0 && c1
< 32) || (c2
> 0 && c2
< 32))
258 /* Store multi-byte form of the character C in STR. The caller should
259 allocate at least 4-byte area at STR in advance. Returns the
260 length of the multi-byte form. If C is an invalid character code,
263 Use macro `CHAR_STRING (C, STR)' instead of calling this function
264 directly if C can be an ASCII character. */
267 char_to_string (c
, str
)
272 len
= char_to_string_1 (c
, str
);
274 invalid_character (c
);
279 /* Return the non-ASCII character corresponding to multi-byte form at
280 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
281 length of the multibyte form in *ACTUAL_LEN.
283 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
284 this function directly if you want ot handle ASCII characters as
288 string_to_char (str
, len
, actual_len
)
289 const unsigned char *str
;
290 int len
, *actual_len
;
292 int c
, bytes
, charset
, c1
, c2
;
294 SPLIT_MULTIBYTE_SEQ (str
, len
, bytes
, charset
, c1
, c2
);
295 c
= MAKE_CHAR (charset
, c1
, c2
);
301 /* Return the length of the multi-byte form at string STR of length LEN.
302 Use the macro MULTIBYTE_FORM_LENGTH instead. */
304 multibyte_form_length (str
, len
)
305 const unsigned char *str
;
310 PARSE_MULTIBYTE_SEQ (str
, len
, bytes
);
314 /* Check multibyte form at string STR of length LEN and set variables
315 pointed by CHARSET, C1, and C2 to charset and position codes of the
316 character at STR, and return 0. If there's no multibyte character,
317 return -1. This should be used only in the macro SPLIT_STRING
318 which checks range of STR in advance. */
321 split_string (str
, len
, charset
, c1
, c2
)
322 const unsigned char *str
;
323 unsigned char *c1
, *c2
;
326 register int bytes
, cs
, code1
, code2
= -1;
328 SPLIT_MULTIBYTE_SEQ (str
, len
, bytes
, cs
, code1
, code2
);
329 if (cs
== CHARSET_ASCII
)
337 /* Return 1 iff character C has valid printable glyph.
338 Use the macro CHAR_PRINTABLE_P instead. */
345 if (ASCII_BYTE_P (c
))
347 else if (SINGLE_BYTE_CHAR_P (c
))
349 else if (c
>= MAX_CHAR
)
352 SPLIT_CHAR (c
, charset
, c1
, c2
);
353 if (! CHARSET_DEFINED_P (charset
))
355 if (CHARSET_CHARS (charset
) == 94
356 ? c1
<= 32 || c1
>= 127
359 if (CHARSET_DIMENSION (charset
) == 2
360 && (CHARSET_CHARS (charset
) == 94
361 ? c2
<= 32 || c2
>= 127
367 /* Translate character C by translation table TABLE. If C
368 is negative, translate a character specified by CHARSET, C1, and C2
369 (C1 and C2 are code points of the character). If no translation is
370 found in TABLE, return C. */
372 translate_char (table
, c
, charset
, c1
, c2
)
374 int c
, charset
, c1
, c2
;
377 int alt_charset
, alt_c1
, alt_c2
, dimension
;
379 if (c
< 0) c
= MAKE_CHAR (charset
, (c1
& 0x7F) , (c2
& 0x7F));
380 if (!CHAR_TABLE_P (table
)
381 || (ch
= Faref (table
, make_number (c
)), !NATNUMP (ch
)))
384 SPLIT_CHAR (XFASTINT (ch
), alt_charset
, alt_c1
, alt_c2
);
385 dimension
= CHARSET_DIMENSION (alt_charset
);
386 if ((dimension
== 1 && alt_c1
> 0) || (dimension
== 2 && alt_c2
> 0))
387 /* CH is not a generic character, just return it. */
388 return XFASTINT (ch
);
390 /* Since CH is a generic character, we must return a specific
391 charater which has the same position codes as C from CH. */
393 SPLIT_CHAR (c
, charset
, c1
, c2
);
394 if (dimension
!= CHARSET_DIMENSION (charset
))
395 /* We can't make such a character because of dimension mismatch. */
397 return MAKE_CHAR (alt_charset
, c1
, c2
);
400 /* Convert the unibyte character C to multibyte based on
401 Vnonascii_translation_table or nonascii_insert_offset. If they can't
402 convert C to a valid multibyte character, convert it based on
403 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
406 unibyte_char_to_multibyte (c
)
409 if (c
< 0400 && c
>= 0200)
413 if (! NILP (Vnonascii_translation_table
))
415 c
= XINT (Faref (Vnonascii_translation_table
, make_number (c
)));
416 if (c
>= 0400 && ! char_valid_p (c
, 0))
417 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
419 else if (c
>= 0240 && nonascii_insert_offset
> 0)
421 c
+= nonascii_insert_offset
;
422 if (c
< 0400 || ! char_valid_p (c
, 0))
423 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
426 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
432 /* Convert the multibyte character C to unibyte 8-bit character based
433 on Vnonascii_translation_table or nonascii_insert_offset. If
434 REV_TBL is non-nil, it should be a reverse table of
435 Vnonascii_translation_table, i.e. what given by:
436 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
439 multibyte_char_to_unibyte (c
, rev_tbl
)
443 if (!SINGLE_BYTE_CHAR_P (c
))
447 if (! CHAR_TABLE_P (rev_tbl
)
448 && CHAR_TABLE_P (Vnonascii_translation_table
))
449 rev_tbl
= Fchar_table_extra_slot (Vnonascii_translation_table
,
451 if (CHAR_TABLE_P (rev_tbl
))
454 temp
= Faref (rev_tbl
, make_number (c
));
458 c
= (c_save
& 0177) + 0200;
462 if (nonascii_insert_offset
> 0)
463 c
-= nonascii_insert_offset
;
464 if (c
< 128 || c
>= 256)
465 c
= (c_save
& 0177) + 0200;
473 /* Update the table Vcharset_table with the given arguments (see the
474 document of `define-charset' for the meaning of each argument).
475 Several other table contents are also updated. The caller should
476 check the validity of CHARSET-ID and the remaining arguments in
480 update_charset_table (charset_id
, dimension
, chars
, width
, direction
,
481 iso_final_char
, iso_graphic_plane
,
482 short_name
, long_name
, description
)
483 Lisp_Object charset_id
, dimension
, chars
, width
, direction
;
484 Lisp_Object iso_final_char
, iso_graphic_plane
;
485 Lisp_Object short_name
, long_name
, description
;
487 int charset
= XINT (charset_id
);
489 unsigned char leading_code_base
, leading_code_ext
;
491 if (NILP (CHARSET_TABLE_ENTRY (charset
)))
492 CHARSET_TABLE_ENTRY (charset
)
493 = Fmake_vector (make_number (CHARSET_MAX_IDX
), Qnil
);
495 if (NILP (long_name
))
496 long_name
= short_name
;
497 if (NILP (description
))
498 description
= long_name
;
500 /* Get byte length of multibyte form, base leading-code, and
501 extended leading-code of the charset. See the comment under the
502 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
503 bytes
= XINT (dimension
);
504 if (charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
506 /* Official charset, it doesn't have an extended leading-code. */
507 if (charset
!= CHARSET_ASCII
&& charset
!= CHARSET_8_BIT_GRAPHIC
)
508 bytes
+= 1; /* For a base leading-code. */
509 leading_code_base
= charset
;
510 leading_code_ext
= 0;
514 /* Private charset. */
515 bytes
+= 2; /* For base and extended leading-codes. */
517 = (charset
< LEADING_CODE_EXT_12
518 ? LEADING_CODE_PRIVATE_11
519 : (charset
< LEADING_CODE_EXT_21
520 ? LEADING_CODE_PRIVATE_12
521 : (charset
< LEADING_CODE_EXT_22
522 ? LEADING_CODE_PRIVATE_21
523 : LEADING_CODE_PRIVATE_22
)));
524 leading_code_ext
= charset
;
525 if (BYTES_BY_CHAR_HEAD (leading_code_base
) != bytes
)
526 error ("Invalid dimension for the charset-ID %d", charset
);
529 CHARSET_TABLE_INFO (charset
, CHARSET_ID_IDX
) = charset_id
;
530 CHARSET_TABLE_INFO (charset
, CHARSET_BYTES_IDX
) = make_number (bytes
);
531 CHARSET_TABLE_INFO (charset
, CHARSET_DIMENSION_IDX
) = dimension
;
532 CHARSET_TABLE_INFO (charset
, CHARSET_CHARS_IDX
) = chars
;
533 CHARSET_TABLE_INFO (charset
, CHARSET_WIDTH_IDX
) = width
;
534 CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
) = direction
;
535 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_BASE_IDX
)
536 = make_number (leading_code_base
);
537 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_EXT_IDX
)
538 = make_number (leading_code_ext
);
539 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_FINAL_CHAR_IDX
) = iso_final_char
;
540 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_GRAPHIC_PLANE_IDX
)
542 CHARSET_TABLE_INFO (charset
, CHARSET_SHORT_NAME_IDX
) = short_name
;
543 CHARSET_TABLE_INFO (charset
, CHARSET_LONG_NAME_IDX
) = long_name
;
544 CHARSET_TABLE_INFO (charset
, CHARSET_DESCRIPTION_IDX
) = description
;
545 CHARSET_TABLE_INFO (charset
, CHARSET_PLIST_IDX
) = Qnil
;
548 /* If we have already defined a charset which has the same
549 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
550 DIRECTION, we must update the entry REVERSE-CHARSET of both
551 charsets. If there's no such charset, the value of the entry
555 for (i
= 0; i
<= MAX_CHARSET
; i
++)
556 if (!NILP (CHARSET_TABLE_ENTRY (i
)))
558 if (CHARSET_DIMENSION (i
) == XINT (dimension
)
559 && CHARSET_CHARS (i
) == XINT (chars
)
560 && CHARSET_ISO_FINAL_CHAR (i
) == XINT (iso_final_char
)
561 && CHARSET_DIRECTION (i
) != XINT (direction
))
563 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
565 CHARSET_TABLE_INFO (i
, CHARSET_REVERSE_CHARSET_IDX
) = charset_id
;
570 /* No such a charset. */
571 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
575 if (charset
!= CHARSET_ASCII
&& charset
!= CHARSET_8_BIT_GRAPHIC
576 && charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
578 bytes_by_char_head
[leading_code_base
] = bytes
;
579 width_by_char_head
[leading_code_base
] = XINT (width
);
581 /* Update table emacs_code_class. */
582 emacs_code_class
[charset
] = (bytes
== 2
583 ? EMACS_leading_code_2
585 ? EMACS_leading_code_3
586 : EMACS_leading_code_4
));
589 /* Update table iso_charset_table. */
590 if (XINT (iso_final_char
) >= 0
591 && ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) < 0)
592 ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) = charset
;
597 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
600 get_charset_id (charset_symbol
)
601 Lisp_Object charset_symbol
;
606 /* This originally used a ?: operator, but reportedly the HP-UX
607 compiler version HP92453-01 A.10.32.22 miscompiles that. */
608 if (SYMBOLP (charset_symbol
)
609 && VECTORP (val
= Fget (charset_symbol
, Qcharset
))
610 && CHARSET_VALID_P (charset
=
611 XINT (XVECTOR (val
)->contents
[CHARSET_ID_IDX
])))
617 /* Return an identification number for a new private charset of
618 DIMENSION and WIDTH. If there's no more room for the new charset,
621 get_new_private_charset_id (dimension
, width
)
622 int dimension
, width
;
624 int charset
, from
, to
;
628 from
= LEADING_CODE_EXT_11
;
629 to
= LEADING_CODE_EXT_21
;
633 from
= LEADING_CODE_EXT_21
;
634 to
= LEADING_CODE_EXT_MAX
+ 1;
637 for (charset
= from
; charset
< to
; charset
++)
638 if (!CHARSET_DEFINED_P (charset
)) break;
640 return make_number (charset
< to
? charset
: 0);
643 DEFUN ("define-charset", Fdefine_charset
, Sdefine_charset
, 3, 3, 0,
644 doc
: /* Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.
645 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is
646 treated as a private charset.
647 INFO-VECTOR is a vector of the format:
648 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE
649 SHORT-NAME LONG-NAME DESCRIPTION]
650 The meanings of each elements is as follows:
651 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.
652 CHARS (integer) is the number of characters in a dimension: 94 or 96.
653 WIDTH (integer) is the number of columns a character in the charset
654 occupies on the screen: one of 0, 1, and 2.
656 DIRECTION (integer) is the rendering direction of characters in the
657 charset when rendering. If 0, render from left to right, else
658 render from right to left.
660 ISO-FINAL-CHAR (character) is the final character of the
661 corresponding ISO 2022 charset.
662 It may be -1 if the charset is internal use only.
664 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked
665 while encoding to variants of ISO 2022 coding system, one of the
666 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).
667 It may be -1 if the charset is internal use only.
669 SHORT-NAME (string) is the short name to refer to the charset.
671 LONG-NAME (string) is the long name to refer to the charset.
673 DESCRIPTION (string) is the description string of the charset. */)
674 (charset_id
, charset_symbol
, info_vector
)
675 Lisp_Object charset_id
, charset_symbol
, info_vector
;
679 if (!NILP (charset_id
))
680 CHECK_NUMBER (charset_id
);
681 CHECK_SYMBOL (charset_symbol
);
682 CHECK_VECTOR (info_vector
);
684 if (! NILP (charset_id
))
686 if (! CHARSET_VALID_P (XINT (charset_id
)))
687 error ("Invalid CHARSET: %d", XINT (charset_id
));
688 else if (CHARSET_DEFINED_P (XINT (charset_id
)))
689 error ("Already defined charset: %d", XINT (charset_id
));
692 vec
= XVECTOR (info_vector
)->contents
;
693 if (XVECTOR (info_vector
)->size
!= 9
694 || !INTEGERP (vec
[0]) || !(XINT (vec
[0]) == 1 || XINT (vec
[0]) == 2)
695 || !INTEGERP (vec
[1]) || !(XINT (vec
[1]) == 94 || XINT (vec
[1]) == 96)
696 || !INTEGERP (vec
[2]) || !(XINT (vec
[2]) == 1 || XINT (vec
[2]) == 2)
697 || !INTEGERP (vec
[3]) || !(XINT (vec
[3]) == 0 || XINT (vec
[3]) == 1)
698 || !INTEGERP (vec
[4])
699 || !(XINT (vec
[4]) == -1 || (XINT (vec
[4]) >= '0' && XINT (vec
[4]) <= '~'))
700 || !INTEGERP (vec
[5])
701 || !(XINT (vec
[5]) == -1 || XINT (vec
[5]) == 0 || XINT (vec
[5]) == 1)
704 || !STRINGP (vec
[8]))
705 error ("Invalid info-vector argument for defining charset %s",
706 SDATA (SYMBOL_NAME (charset_symbol
)));
708 if (NILP (charset_id
))
710 charset_id
= get_new_private_charset_id (XINT (vec
[0]), XINT (vec
[2]));
711 if (XINT (charset_id
) == 0)
712 error ("There's no room for a new private charset %s",
713 SDATA (SYMBOL_NAME (charset_symbol
)));
716 update_charset_table (charset_id
, vec
[0], vec
[1], vec
[2], vec
[3],
717 vec
[4], vec
[5], vec
[6], vec
[7], vec
[8]);
718 Fput (charset_symbol
, Qcharset
, CHARSET_TABLE_ENTRY (XINT (charset_id
)));
719 CHARSET_SYMBOL (XINT (charset_id
)) = charset_symbol
;
720 Vcharset_list
= Fcons (charset_symbol
, Vcharset_list
);
721 Fupdate_coding_systems_internal ();
725 DEFUN ("generic-character-list", Fgeneric_character_list
,
726 Sgeneric_character_list
, 0, 0, 0,
727 doc
: /* Return a list of all possible generic characters.
728 It includes a generic character for a charset not yet defined. */)
731 return Vgeneric_character_list
;
734 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char
,
735 Sget_unused_iso_final_char
, 2, 2, 0,
736 doc
: /* Return an unused ISO's final char for a charset of DIMENSION and CHARS.
737 DIMENSION is the number of bytes to represent a character: 1 or 2.
738 CHARS is the number of characters in a dimension: 94 or 96.
740 This final char is for private use, thus the range is `0' (48) .. `?' (63).
741 If there's no unused final char for the specified kind of charset,
744 Lisp_Object dimension
, chars
;
748 CHECK_NUMBER (dimension
);
749 CHECK_NUMBER (chars
);
750 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
751 error ("Invalid charset dimension %d, it should be 1 or 2",
753 if (XINT (chars
) != 94 && XINT (chars
) != 96)
754 error ("Invalid charset chars %d, it should be 94 or 96",
756 for (final_char
= '0'; final_char
<= '?'; final_char
++)
758 if (ISO_CHARSET_TABLE (dimension
, chars
, make_number (final_char
)) < 0)
761 return (final_char
<= '?' ? make_number (final_char
) : Qnil
);
764 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset
, Sdeclare_equiv_charset
,
766 doc
: /* Declare an equivalent charset for ISO-2022 decoding.
768 On decoding by an ISO-2022 base coding system, when a charset
769 specified by DIMENSION, CHARS, and FINAL-CHAR is designated, behave as
770 if CHARSET is designated instead. */)
771 (dimension
, chars
, final_char
, charset
)
772 Lisp_Object dimension
, chars
, final_char
, charset
;
776 CHECK_NUMBER (dimension
);
777 CHECK_NUMBER (chars
);
778 CHECK_NUMBER (final_char
);
779 CHECK_SYMBOL (charset
);
781 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
782 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension
));
783 if (XINT (chars
) != 94 && XINT (chars
) != 96)
784 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars
));
785 if (XINT (final_char
) < '0' || XFASTINT (final_char
) > '~')
786 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars
));
787 if ((charset_id
= get_charset_id (charset
)) < 0)
788 error ("Invalid charset %s", SDATA (SYMBOL_NAME (charset
)));
790 ISO_CHARSET_TABLE (dimension
, chars
, final_char
) = charset_id
;
794 /* Return information about charsets in the text at PTR of NBYTES
795 bytes, which are NCHARS characters. The value is:
797 0: Each character is represented by one byte. This is always
798 true for unibyte text.
799 1: No charsets other than ascii eight-bit-control,
800 eight-bit-graphic, and latin-1 are found.
803 In addition, if CHARSETS is nonzero, for each found charset N, set
804 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
805 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
806 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
807 1 (note that there's no charset whose ID is 1). */
810 find_charset_in_text (ptr
, nchars
, nbytes
, charsets
, table
)
811 const unsigned char *ptr
;
812 int nchars
, nbytes
, *charsets
;
815 if (nchars
== nbytes
)
817 if (charsets
&& nbytes
> 0)
819 const unsigned char *endp
= ptr
+ nbytes
;
822 while (ptr
< endp
&& maskbits
!= 7)
824 maskbits
|= (*ptr
< 0x80 ? 1 : *ptr
< 0xA0 ? 2 : 4);
829 charsets
[CHARSET_ASCII
] = 1;
831 charsets
[CHARSET_8_BIT_CONTROL
] = 1;
833 charsets
[CHARSET_8_BIT_GRAPHIC
] = 1;
840 int bytes
, charset
, c1
, c2
;
842 if (! CHAR_TABLE_P (table
))
847 SPLIT_MULTIBYTE_SEQ (ptr
, len
, bytes
, charset
, c1
, c2
);
850 if (!CHARSET_DEFINED_P (charset
))
852 else if (! NILP (table
))
854 int c
= translate_char (table
, -1, charset
, c1
, c2
);
856 charset
= CHAR_CHARSET (c
);
860 && charset
!= CHARSET_ASCII
861 && charset
!= CHARSET_8_BIT_CONTROL
862 && charset
!= CHARSET_8_BIT_GRAPHIC
863 && charset
!= charset_latin_iso8859_1
)
867 charsets
[charset
] = 1;
868 else if (return_val
== 2)
875 DEFUN ("find-charset-region", Ffind_charset_region
, Sfind_charset_region
,
877 doc
: /* Return a list of charsets in the region between BEG and END.
878 BEG and END are buffer positions.
879 Optional arg TABLE if non-nil is a translation table to look up.
881 If the region contains invalid multibyte characters,
882 `unknown' is included in the returned list.
884 If the current buffer is unibyte, the returned list may contain
885 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
887 Lisp_Object beg
, end
, table
;
889 int charsets
[MAX_CHARSET
+ 1];
890 int from
, from_byte
, to
, stop
, stop_byte
, i
;
893 validate_region (&beg
, &end
);
894 from
= XFASTINT (beg
);
895 stop
= to
= XFASTINT (end
);
897 if (from
< GPT
&& GPT
< to
)
900 stop_byte
= GPT_BYTE
;
903 stop_byte
= CHAR_TO_BYTE (stop
);
905 from_byte
= CHAR_TO_BYTE (from
);
907 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
910 find_charset_in_text (BYTE_POS_ADDR (from_byte
), stop
- from
,
911 stop_byte
- from_byte
, charsets
, table
);
914 from
= stop
, from_byte
= stop_byte
;
915 stop
= to
, stop_byte
= CHAR_TO_BYTE (stop
);
923 val
= Fcons (Qunknown
, val
);
924 for (i
= MAX_CHARSET
; i
>= MIN_CHARSET_OFFICIAL_DIMENSION1
; i
--)
926 val
= Fcons (CHARSET_SYMBOL (i
), val
);
928 val
= Fcons (Qascii
, val
);
932 DEFUN ("find-charset-string", Ffind_charset_string
, Sfind_charset_string
,
934 doc
: /* Return a list of charsets in STR.
935 Optional arg TABLE if non-nil is a translation table to look up.
937 If the string contains invalid multibyte characters,
938 `unknown' is included in the returned list.
940 If STR is unibyte, the returned list may contain
941 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
943 Lisp_Object str
, table
;
945 int charsets
[MAX_CHARSET
+ 1];
951 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
952 find_charset_in_text (SDATA (str
), SCHARS (str
),
953 SBYTES (str
), charsets
, table
);
957 val
= Fcons (Qunknown
, val
);
958 for (i
= MAX_CHARSET
; i
>= MIN_CHARSET_OFFICIAL_DIMENSION1
; i
--)
960 val
= Fcons (CHARSET_SYMBOL (i
), val
);
962 val
= Fcons (Qascii
, val
);
967 DEFUN ("make-char-internal", Fmake_char_internal
, Smake_char_internal
, 1, 3, 0,
968 doc
: /* Return a character made from arguments.
969 Internal use only. */)
970 (charset
, code1
, code2
)
971 Lisp_Object charset
, code1
, code2
;
973 int charset_id
, c1
, c2
;
975 CHECK_NUMBER (charset
);
976 charset_id
= XINT (charset
);
977 if (!CHARSET_DEFINED_P (charset_id
))
978 error ("Invalid charset ID: %d", XINT (charset
));
984 CHECK_NUMBER (code1
);
991 CHECK_NUMBER (code2
);
995 if (charset_id
== CHARSET_ASCII
)
997 if (c1
< 0 || c1
> 0x7F)
998 goto invalid_code_posints
;
999 return make_number (c1
);
1001 else if (charset_id
== CHARSET_8_BIT_CONTROL
)
1005 else if (c1
< 0x80 || c1
> 0x9F)
1006 goto invalid_code_posints
;
1007 return make_number (c1
);
1009 else if (charset_id
== CHARSET_8_BIT_GRAPHIC
)
1013 else if (c1
< 0xA0 || c1
> 0xFF)
1014 goto invalid_code_posints
;
1015 return make_number (c1
);
1017 else if (c1
< 0 || c1
> 0xFF || c2
< 0 || c2
> 0xFF)
1018 goto invalid_code_posints
;
1024 ? !CHAR_COMPONENTS_VALID_P (charset_id
, c1
, 0x20)
1025 : !CHAR_COMPONENTS_VALID_P (charset_id
, c1
, c2
)))
1026 goto invalid_code_posints
;
1027 return make_number (MAKE_CHAR (charset_id
, c1
, c2
));
1029 invalid_code_posints
:
1030 error ("Invalid code points for charset ID %d: %d %d", charset_id
, c1
, c2
);
1033 DEFUN ("split-char", Fsplit_char
, Ssplit_char
, 1, 1, 0,
1034 doc
: /* Return list of charset and one or two position-codes of CH.
1035 If CH is invalid as a character code,
1036 return a list of symbol `unknown' and CH. */)
1040 int c
, charset
, c1
, c2
;
1044 if (!CHAR_VALID_P (c
, 1))
1045 return Fcons (Qunknown
, Fcons (ch
, Qnil
));
1046 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
1048 ? Fcons (CHARSET_SYMBOL (charset
),
1049 Fcons (make_number (c1
), Fcons (make_number (c2
), Qnil
)))
1050 : Fcons (CHARSET_SYMBOL (charset
), Fcons (make_number (c1
), Qnil
)));
1053 DEFUN ("char-charset", Fchar_charset
, Schar_charset
, 1, 1, 0,
1054 doc
: /* Return charset of CH. */)
1060 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch
)));
1063 DEFUN ("charset-after", Fcharset_after
, Scharset_after
, 0, 1, 0,
1064 doc
: /* Return charset of a character in the current buffer at position POS.
1065 If POS is nil, it defauls to the current point.
1066 If POS is out of range, the value is nil. */)
1073 ch
= Fchar_after (pos
);
1074 if (! INTEGERP (ch
))
1076 charset
= CHAR_CHARSET (XINT (ch
));
1077 return CHARSET_SYMBOL (charset
);
1080 DEFUN ("iso-charset", Fiso_charset
, Siso_charset
, 3, 3, 0,
1081 doc
: /* Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.
1083 ISO 2022's designation sequence (escape sequence) distinguishes charsets
1084 by their DIMENSION, CHARS, and FINAL-CHAR,
1085 where as Emacs distinguishes them by charset symbol.
1086 See the documentation of the function `charset-info' for the meanings of
1087 DIMENSION, CHARS, and FINAL-CHAR. */)
1088 (dimension
, chars
, final_char
)
1089 Lisp_Object dimension
, chars
, final_char
;
1093 CHECK_NUMBER (dimension
);
1094 CHECK_NUMBER (chars
);
1095 CHECK_NUMBER (final_char
);
1097 if ((charset
= ISO_CHARSET_TABLE (dimension
, chars
, final_char
)) < 0)
1099 return CHARSET_SYMBOL (charset
);
1102 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1103 generic character. If GENERICP is zero, return nonzero iff C is a
1104 valid normal character. Do not call this function directly,
1105 instead use macro CHAR_VALID_P. */
1107 char_valid_p (c
, genericp
)
1110 int charset
, c1
, c2
;
1112 if (c
< 0 || c
>= MAX_CHAR
)
1114 if (SINGLE_BYTE_CHAR_P (c
))
1116 SPLIT_CHAR (c
, charset
, c1
, c2
);
1121 if (c2
<= 0) c2
= 0x20;
1125 if (c2
<= 0) c1
= c2
= 0x20;
1128 return (CHARSET_DEFINED_P (charset
)
1129 && CHAR_COMPONENTS_VALID_P (charset
, c1
, c2
));
1132 DEFUN ("char-valid-p", Fchar_valid_p
, Schar_valid_p
, 1, 2, 0,
1133 doc
: /* Return t if OBJECT is a valid normal character.
1134 If optional arg GENERICP is non-nil, also return t if OBJECT is
1135 a valid generic character. */)
1137 Lisp_Object object
, genericp
;
1139 if (! NATNUMP (object
))
1141 return (CHAR_VALID_P (XFASTINT (object
), !NILP (genericp
)) ? Qt
: Qnil
);
1144 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
1145 Sunibyte_char_to_multibyte
, 1, 1, 0,
1146 doc
: /* Convert the unibyte character CH to multibyte character.
1147 The conversion is done based on `nonascii-translation-table' (which see)
1148 or `nonascii-insert-offset' (which see). */)
1156 if (c
< 0 || c
>= 0400)
1157 error ("Invalid unibyte character: %d", c
);
1158 c
= unibyte_char_to_multibyte (c
);
1160 error ("Can't convert to multibyte character: %d", XINT (ch
));
1161 return make_number (c
);
1164 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte
,
1165 Smultibyte_char_to_unibyte
, 1, 1, 0,
1166 doc
: /* Convert the multibyte character CH to unibyte character.
1167 The conversion is done based on `nonascii-translation-table' (which see)
1168 or `nonascii-insert-offset' (which see). */)
1176 if (! CHAR_VALID_P (c
, 0))
1177 error ("Invalid multibyte character: %d", c
);
1178 c
= multibyte_char_to_unibyte (c
, Qnil
);
1180 error ("Can't convert to unibyte character: %d", XINT (ch
));
1181 return make_number (c
);
1184 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
1185 doc
: /* Return 1 regardless of the argument CH. */)
1190 return make_number (1);
1193 /* Return how many bytes C will occupy in a multibyte buffer.
1194 Don't call this function directly, instead use macro CHAR_BYTES. */
1201 if (ASCII_BYTE_P (c
) || (c
& ~((1 << CHARACTERBITS
) -1)))
1203 if (SINGLE_BYTE_CHAR_P (c
) && c
>= 0xA0)
1206 charset
= CHAR_CHARSET (c
);
1207 return (CHARSET_DEFINED_P (charset
) ? CHARSET_BYTES (charset
) : 1);
1210 /* Return the width of character of which multi-byte form starts with
1211 C. The width is measured by how many columns occupied on the
1212 screen when displayed in the current buffer. */
1214 #define ONE_BYTE_CHAR_WIDTH(c) \
1217 ? XFASTINT (current_buffer->tab_width) \
1218 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1222 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1223 : ((! NILP (current_buffer->enable_multibyte_characters) \
1224 && BASE_LEADING_CODE_P (c)) \
1225 ? WIDTH_BY_CHAR_HEAD (c) \
1228 DEFUN ("char-width", Fchar_width
, Schar_width
, 1, 1, 0,
1229 doc
: /* Return width of CH when displayed in the current buffer.
1230 The width is measured by how many columns it occupies on the screen.
1231 Tab is taken to occupy `tab-width' columns. */)
1235 Lisp_Object val
, disp
;
1237 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1243 /* Get the way the display table would display it. */
1244 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
1247 XSETINT (val
, XVECTOR (disp
)->size
);
1248 else if (SINGLE_BYTE_CHAR_P (c
))
1249 XSETINT (val
, ONE_BYTE_CHAR_WIDTH (c
));
1252 int charset
= CHAR_CHARSET (c
);
1254 XSETFASTINT (val
, CHARSET_WIDTH (charset
));
1259 /* Return width of string STR of length LEN when displayed in the
1260 current buffer. The width is measured by how many columns it
1261 occupies on the screen. */
1268 return c_string_width (str
, len
, -1, NULL
, NULL
);
1271 /* Return width of string STR of length LEN when displayed in the
1272 current buffer. The width is measured by how many columns it
1273 occupies on the screen. If PRECISION > 0, return the width of
1274 longest substring that doesn't exceed PRECISION, and set number of
1275 characters and bytes of the substring in *NCHARS and *NBYTES
1279 c_string_width (str
, len
, precision
, nchars
, nbytes
)
1280 const unsigned char *str
;
1281 int len
, precision
, *nchars
, *nbytes
;
1283 int i
= 0, i_byte
= 0;
1286 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1288 while (i_byte
< len
)
1290 int bytes
, thiswidth
;
1295 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
1298 val
= DISP_CHAR_VECTOR (dp
, c
);
1300 thiswidth
= XVECTOR (val
)->size
;
1302 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1307 PARSE_MULTIBYTE_SEQ (str
+ i_byte
, len
- i_byte
, bytes
);
1308 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1312 && (width
+ thiswidth
> precision
))
1332 /* Return width of Lisp string STRING when displayed in the current
1333 buffer. The width is measured by how many columns it occupies on
1334 the screen while paying attention to compositions. If PRECISION >
1335 0, return the width of longest substring that doesn't exceed
1336 PRECISION, and set number of characters and bytes of the substring
1337 in *NCHARS and *NBYTES respectively. */
1340 lisp_string_width (string
, precision
, nchars
, nbytes
)
1342 int precision
, *nchars
, *nbytes
;
1344 int len
= SCHARS (string
);
1345 int len_byte
= SBYTES (string
);
1346 const unsigned char *str
= SDATA (string
);
1347 int i
= 0, i_byte
= 0;
1349 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1353 int chars
, bytes
, thiswidth
;
1358 if (find_composition (i
, -1, &ignore
, &end
, &val
, string
)
1359 && ((cmp_id
= get_composition_id (i
, i_byte
, end
- i
, val
, string
))
1362 thiswidth
= composition_table
[cmp_id
]->width
;
1364 bytes
= string_char_to_byte (string
, end
) - i_byte
;
1368 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
1371 val
= DISP_CHAR_VECTOR (dp
, c
);
1373 thiswidth
= XVECTOR (val
)->size
;
1375 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1380 PARSE_MULTIBYTE_SEQ (str
+ i_byte
, len_byte
- i_byte
, bytes
);
1381 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1385 && (width
+ thiswidth
> precision
))
1405 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
1406 doc
: /* Return width of STRING when displayed in the current buffer.
1407 Width is measured by how many columns it occupies on the screen.
1408 When calculating width of a multibyte character in STRING,
1409 only the base leading-code is considered; the validity of
1410 the following bytes is not checked. Tabs in STRING are always
1411 taken to occupy `tab-width' columns. */)
1417 CHECK_STRING (string
);
1418 XSETFASTINT (val
, lisp_string_width (string
, -1, NULL
, NULL
));
1422 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
1423 doc
: /* Return the direction of CH.
1424 The returned value is 0 for left-to-right and 1 for right-to-left. */)
1431 charset
= CHAR_CHARSET (XFASTINT (ch
));
1432 if (!CHARSET_DEFINED_P (charset
))
1433 invalid_character (XINT (ch
));
1434 return CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
);
1437 /* Return the number of characters in the NBYTES bytes at PTR.
1438 This works by looking at the contents and checking for multibyte sequences.
1439 However, if the current buffer has enable-multibyte-characters = nil,
1440 we treat each byte as a character. */
1443 chars_in_text (ptr
, nbytes
)
1444 const unsigned char *ptr
;
1447 /* current_buffer is null at early stages of Emacs initialization. */
1448 if (current_buffer
== 0
1449 || NILP (current_buffer
->enable_multibyte_characters
))
1452 return multibyte_chars_in_text (ptr
, nbytes
);
1455 /* Return the number of characters in the NBYTES bytes at PTR.
1456 This works by looking at the contents and checking for multibyte sequences.
1457 It ignores enable-multibyte-characters. */
1460 multibyte_chars_in_text (ptr
, nbytes
)
1461 const unsigned char *ptr
;
1464 const unsigned char *endp
;
1467 endp
= ptr
+ nbytes
;
1472 PARSE_MULTIBYTE_SEQ (ptr
, endp
- ptr
, bytes
);
1480 /* Parse unibyte text at STR of LEN bytes as multibyte text, and
1481 count the numbers of characters and bytes in it. On counting
1482 bytes, pay attention to the fact that 8-bit characters in the range
1483 0x80..0x9F are represented by 2 bytes in multibyte text. */
1485 parse_str_as_multibyte (str
, len
, nchars
, nbytes
)
1486 const unsigned char *str
;
1487 int len
, *nchars
, *nbytes
;
1489 const unsigned char *endp
= str
+ len
;
1490 int n
, chars
= 0, bytes
= 0;
1494 if (UNIBYTE_STR_AS_MULTIBYTE_P (str
, endp
- str
, n
))
1495 str
+= n
, bytes
+= n
;
1505 /* Arrange unibyte text at STR of NBYTES bytes as multibyte text.
1506 It actually converts only 8-bit characters in the range 0x80..0x9F
1507 that don't contruct multibyte characters to multibyte forms. If
1508 NCHARS is nonzero, set *NCHARS to the number of characters in the
1509 text. It is assured that we can use LEN bytes at STR as a work
1510 area and that is enough. Return the number of bytes of the
1514 str_as_multibyte (str
, len
, nbytes
, nchars
)
1516 int len
, nbytes
, *nchars
;
1518 unsigned char *p
= str
, *endp
= str
+ nbytes
;
1523 while (p
< endp
&& UNIBYTE_STR_AS_MULTIBYTE_P (p
, endp
- p
, n
))
1533 safe_bcopy (p
, endp
- nbytes
, nbytes
);
1537 if (UNIBYTE_STR_AS_MULTIBYTE_P (p
, endp
- p
, n
))
1544 *to
++ = LEADING_CODE_8_BIT_CONTROL
;
1545 *to
++ = *p
++ + 0x20;
1554 /* Parse unibyte string at STR of LEN bytes, and return the number of
1555 bytes it may ocupy when converted to multibyte string by
1556 `str_to_multibyte'. */
1559 parse_str_to_multibyte (str
, len
)
1563 unsigned char *endp
= str
+ len
;
1566 for (bytes
= 0; str
< endp
; str
++)
1567 bytes
+= (*str
< 0x80 || *str
>= 0xA0) ? 1 : 2;
1571 /* Convert unibyte text at STR of NBYTES bytes to multibyte text
1572 that contains the same single-byte characters. It actually
1573 converts all 8-bit characters to multibyte forms. It is assured
1574 that we can use LEN bytes at STR as a work area and that is
1578 str_to_multibyte (str
, len
, bytes
)
1582 unsigned char *p
= str
, *endp
= str
+ bytes
;
1585 while (p
< endp
&& (*p
< 0x80 || *p
>= 0xA0)) p
++;
1591 safe_bcopy (p
, endp
- bytes
, bytes
);
1595 if (*p
< 0x80 || *p
>= 0xA0)
1598 *to
++ = LEADING_CODE_8_BIT_CONTROL
, *to
++ = *p
++ + 0x20;
1603 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1604 actually converts only 8-bit characters in the range 0x80..0x9F to
1608 str_as_unibyte (str
, bytes
)
1612 unsigned char *p
= str
, *endp
= str
+ bytes
;
1613 unsigned char *to
= str
;
1615 while (p
< endp
&& *p
!= LEADING_CODE_8_BIT_CONTROL
) p
++;
1619 if (*p
== LEADING_CODE_8_BIT_CONTROL
)
1620 *to
++ = *(p
+ 1) - 0x20, p
+= 2;
1628 DEFUN ("string", Fstring
, Sstring
, 0, MANY
, 0,
1629 doc
: /* Concatenate all the argument characters and make the result a string.
1630 usage: (string &rest CHARACTERS) */)
1636 unsigned char *buf
, *p
;
1642 bufsize
= MAX_MULTIBYTE_LENGTH
* n
;
1643 SAFE_ALLOCA (buf
, unsigned char *, bufsize
);
1646 for (i
= 0; i
< n
; i
++)
1648 CHECK_NUMBER (args
[i
]);
1649 if (!multibyte
&& !SINGLE_BYTE_CHAR_P (XFASTINT (args
[i
])))
1653 for (i
= 0; i
< n
; i
++)
1657 p
+= CHAR_STRING (c
, p
);
1662 ret
= make_string_from_bytes (buf
, n
, p
- buf
);
1671 charset_id_internal (charset_name
)
1676 val
= Fget (intern (charset_name
), Qcharset
);
1678 error ("Charset %s is not defined", charset_name
);
1680 return (XINT (XVECTOR (val
)->contents
[0]));
1683 DEFUN ("setup-special-charsets", Fsetup_special_charsets
,
1684 Ssetup_special_charsets
, 0, 0, 0, doc
: /* Internal use only. */)
1687 charset_latin_iso8859_1
= charset_id_internal ("latin-iso8859-1");
1688 charset_jisx0208_1978
= charset_id_internal ("japanese-jisx0208-1978");
1689 charset_jisx0208
= charset_id_internal ("japanese-jisx0208");
1690 charset_katakana_jisx0201
= charset_id_internal ("katakana-jisx0201");
1691 charset_latin_jisx0201
= charset_id_internal ("latin-jisx0201");
1692 charset_big5_1
= charset_id_internal ("chinese-big5-1");
1693 charset_big5_2
= charset_id_internal ("chinese-big5-2");
1694 charset_mule_unicode_0100_24ff
1695 = charset_id_internal ("mule-unicode-0100-24ff");
1696 charset_mule_unicode_2500_33ff
1697 = charset_id_internal ("mule-unicode-2500-33ff");
1698 charset_mule_unicode_e000_ffff
1699 = charset_id_internal ("mule-unicode-e000-ffff");
1704 init_charset_once ()
1708 staticpro (&Vcharset_table
);
1709 staticpro (&Vcharset_symbol_table
);
1710 staticpro (&Vgeneric_character_list
);
1712 /* This has to be done here, before we call Fmake_char_table. */
1713 Qcharset_table
= intern ("charset-table");
1714 staticpro (&Qcharset_table
);
1716 /* Intern this now in case it isn't already done.
1717 Setting this variable twice is harmless.
1718 But don't staticpro it here--that is done in alloc.c. */
1719 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
1721 /* Now we are ready to set up this property, so we can
1722 create the charset table. */
1723 Fput (Qcharset_table
, Qchar_table_extra_slots
, make_number (0));
1724 Vcharset_table
= Fmake_char_table (Qcharset_table
, Qnil
);
1726 Qunknown
= intern ("unknown");
1727 staticpro (&Qunknown
);
1728 Vcharset_symbol_table
= Fmake_vector (make_number (MAX_CHARSET
+ 1),
1732 for (i
= 0; i
< 2; i
++)
1733 for (j
= 0; j
< 2; j
++)
1734 for (k
= 0; k
< 128; k
++)
1735 iso_charset_table
[i
][j
][k
] = -1;
1737 for (i
= 0; i
< 256; i
++)
1738 bytes_by_char_head
[i
] = 1;
1739 bytes_by_char_head
[LEADING_CODE_PRIVATE_11
] = 3;
1740 bytes_by_char_head
[LEADING_CODE_PRIVATE_12
] = 3;
1741 bytes_by_char_head
[LEADING_CODE_PRIVATE_21
] = 4;
1742 bytes_by_char_head
[LEADING_CODE_PRIVATE_22
] = 4;
1744 for (i
= 0; i
< 128; i
++)
1745 width_by_char_head
[i
] = 1;
1746 for (; i
< 256; i
++)
1747 width_by_char_head
[i
] = 4;
1748 width_by_char_head
[LEADING_CODE_PRIVATE_11
] = 1;
1749 width_by_char_head
[LEADING_CODE_PRIVATE_12
] = 2;
1750 width_by_char_head
[LEADING_CODE_PRIVATE_21
] = 1;
1751 width_by_char_head
[LEADING_CODE_PRIVATE_22
] = 2;
1757 for (i
= 0x81; i
< 0x90; i
++)
1758 val
= Fcons (make_number ((i
- 0x70) << 7), val
);
1759 for (; i
< 0x9A; i
++)
1760 val
= Fcons (make_number ((i
- 0x8F) << 14), val
);
1761 for (i
= 0xA0; i
< 0xF0; i
++)
1762 val
= Fcons (make_number ((i
- 0x70) << 7), val
);
1763 for (; i
< 0xFF; i
++)
1764 val
= Fcons (make_number ((i
- 0xE0) << 14), val
);
1765 Vgeneric_character_list
= Fnreverse (val
);
1768 nonascii_insert_offset
= 0;
1769 Vnonascii_translation_table
= Qnil
;
1777 Qcharset
= intern ("charset");
1778 staticpro (&Qcharset
);
1780 Qascii
= intern ("ascii");
1781 staticpro (&Qascii
);
1783 Qeight_bit_control
= intern ("eight-bit-control");
1784 staticpro (&Qeight_bit_control
);
1786 Qeight_bit_graphic
= intern ("eight-bit-graphic");
1787 staticpro (&Qeight_bit_graphic
);
1789 /* Define special charsets ascii, eight-bit-control, and
1790 eight-bit-graphic. */
1791 update_charset_table (make_number (CHARSET_ASCII
),
1792 make_number (1), make_number (94),
1797 build_string ("ASCII"),
1798 Qnil
, /* same as above */
1799 build_string ("ASCII (ISO646 IRV)"));
1800 CHARSET_SYMBOL (CHARSET_ASCII
) = Qascii
;
1801 Fput (Qascii
, Qcharset
, CHARSET_TABLE_ENTRY (CHARSET_ASCII
));
1803 update_charset_table (make_number (CHARSET_8_BIT_CONTROL
),
1804 make_number (1), make_number (96),
1809 build_string ("8-bit control code (0x80..0x9F)"),
1810 Qnil
, /* same as above */
1811 Qnil
); /* same as above */
1812 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL
) = Qeight_bit_control
;
1813 Fput (Qeight_bit_control
, Qcharset
,
1814 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL
));
1816 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC
),
1817 make_number (1), make_number (96),
1822 build_string ("8-bit graphic char (0xA0..0xFF)"),
1823 Qnil
, /* same as above */
1824 Qnil
); /* same as above */
1825 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC
) = Qeight_bit_graphic
;
1826 Fput (Qeight_bit_graphic
, Qcharset
,
1827 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC
));
1829 Qauto_fill_chars
= intern ("auto-fill-chars");
1830 staticpro (&Qauto_fill_chars
);
1831 Fput (Qauto_fill_chars
, Qchar_table_extra_slots
, make_number (0));
1833 defsubr (&Sdefine_charset
);
1834 defsubr (&Sgeneric_character_list
);
1835 defsubr (&Sget_unused_iso_final_char
);
1836 defsubr (&Sdeclare_equiv_charset
);
1837 defsubr (&Sfind_charset_region
);
1838 defsubr (&Sfind_charset_string
);
1839 defsubr (&Smake_char_internal
);
1840 defsubr (&Ssplit_char
);
1841 defsubr (&Schar_charset
);
1842 defsubr (&Scharset_after
);
1843 defsubr (&Siso_charset
);
1844 defsubr (&Schar_valid_p
);
1845 defsubr (&Sunibyte_char_to_multibyte
);
1846 defsubr (&Smultibyte_char_to_unibyte
);
1847 defsubr (&Schar_bytes
);
1848 defsubr (&Schar_width
);
1849 defsubr (&Sstring_width
);
1850 defsubr (&Schar_direction
);
1852 defsubr (&Ssetup_special_charsets
);
1854 DEFVAR_LISP ("charset-list", &Vcharset_list
,
1855 doc
: /* List of charsets ever defined. */);
1856 Vcharset_list
= Fcons (Qascii
, Fcons (Qeight_bit_control
,
1857 Fcons (Qeight_bit_graphic
, Qnil
)));
1859 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector
,
1860 doc
: /* Vector of cons cell of a symbol and translation table ever defined.
1861 An ID of a translation table is an index of this vector. */);
1862 Vtranslation_table_vector
= Fmake_vector (make_number (16), Qnil
);
1864 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11
,
1865 doc
: /* Leading-code of private TYPE9N charset of column-width 1. */);
1866 leading_code_private_11
= LEADING_CODE_PRIVATE_11
;
1868 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12
,
1869 doc
: /* Leading-code of private TYPE9N charset of column-width 2. */);
1870 leading_code_private_12
= LEADING_CODE_PRIVATE_12
;
1872 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21
,
1873 doc
: /* Leading-code of private TYPE9Nx9N charset of column-width 1. */);
1874 leading_code_private_21
= LEADING_CODE_PRIVATE_21
;
1876 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22
,
1877 doc
: /* Leading-code of private TYPE9Nx9N charset of column-width 2. */);
1878 leading_code_private_22
= LEADING_CODE_PRIVATE_22
;
1880 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset
,
1881 doc
: /* Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.
1882 This is used for converting unibyte text to multibyte,
1883 and for inserting character codes specified by number.
1885 This serves to convert a Latin-1 or similar 8-bit character code
1886 to the corresponding Emacs multibyte character code.
1887 Typically the value should be (- (make-char CHARSET 0) 128),
1888 for your choice of character set.
1889 If `nonascii-translation-table' is non-nil, it overrides this variable. */);
1890 nonascii_insert_offset
= 0;
1892 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table
,
1893 doc
: /* Translation table to convert non-ASCII unibyte codes to multibyte.
1894 This is used for converting unibyte text to multibyte,
1895 and for inserting character codes specified by number.
1897 Conversion is performed only when multibyte characters are enabled,
1898 and it serves to convert a Latin-1 or similar 8-bit character code
1899 to the corresponding Emacs character code.
1901 If this is nil, `nonascii-insert-offset' is used instead.
1902 See also the docstring of `make-translation-table'. */);
1903 Vnonascii_translation_table
= Qnil
;
1905 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars
,
1906 doc
: /* A char-table for characters which invoke auto-filling.
1907 Such characters have value t in this table. */);
1908 Vauto_fill_chars
= Fmake_char_table (Qauto_fill_chars
, Qnil
);
1909 CHAR_TABLE_SET (Vauto_fill_chars
, make_number (' '), Qt
);
1910 CHAR_TABLE_SET (Vauto_fill_chars
, make_number ('\n'), Qt
);
1915 /* arch-tag: 66a89b8d-4c28-47d3-9ca1-56f78440d69f
1916 (do not change this comment) */