1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
6 This file is part of GNU Emacs.
8 GNU Emacs is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
13 GNU Emacs is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GNU Emacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* At first, see the document in `charset.h' to understand the code in
34 #include <sys/types.h>
38 #include "composite.h"
48 Lisp_Object Qcharset
, Qascii
, Qeight_bit_control
, Qeight_bit_graphic
;
51 /* Declaration of special leading-codes. */
52 int leading_code_private_11
; /* for private DIMENSION1 of 1-column */
53 int leading_code_private_12
; /* for private DIMENSION1 of 2-column */
54 int leading_code_private_21
; /* for private DIMENSION2 of 1-column */
55 int leading_code_private_22
; /* for private DIMENSION2 of 2-column */
57 /* Declaration of special charsets. The values are set by
58 Fsetup_special_charsets. */
59 int charset_latin_iso8859_1
; /* ISO8859-1 (Latin-1) */
60 int charset_jisx0208_1978
; /* JISX0208.1978 (Japanese Kanji old set) */
61 int charset_jisx0208
; /* JISX0208.1983 (Japanese Kanji) */
62 int charset_katakana_jisx0201
; /* JISX0201.Kana (Japanese Katakana) */
63 int charset_latin_jisx0201
; /* JISX0201.Roman (Japanese Roman) */
64 int charset_big5_1
; /* Big5 Level 1 (Chinese Traditional) */
65 int charset_big5_2
; /* Big5 Level 2 (Chinese Traditional) */
67 Lisp_Object Qcharset_table
;
69 /* A char-table containing information of each character set. */
70 Lisp_Object Vcharset_table
;
72 /* A vector of charset symbol indexed by charset-id. This is used
73 only for returning charset symbol from C functions. */
74 Lisp_Object Vcharset_symbol_table
;
76 /* A list of charset symbols ever defined. */
77 Lisp_Object Vcharset_list
;
79 /* Vector of translation table ever defined.
80 ID of a translation table is used to index this vector. */
81 Lisp_Object Vtranslation_table_vector
;
83 /* A char-table for characters which may invoke auto-filling. */
84 Lisp_Object Vauto_fill_chars
;
86 Lisp_Object Qauto_fill_chars
;
88 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
89 int bytes_by_char_head
[256];
90 int width_by_char_head
[256];
92 /* Mapping table from ISO2022's charset (specified by DIMENSION,
93 CHARS, and FINAL-CHAR) to Emacs' charset. */
94 int iso_charset_table
[2][2][128];
96 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
97 unsigned char *_fetch_multibyte_char_p
;
98 int _fetch_multibyte_char_len
;
100 /* Offset to add to a non-ASCII value when inserting it. */
101 int nonascii_insert_offset
;
103 /* Translation table for converting non-ASCII unibyte characters
104 to multibyte codes, or nil. */
105 Lisp_Object Vnonascii_translation_table
;
107 /* List of all possible generic characters. */
108 Lisp_Object Vgeneric_character_list
;
110 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
111 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
114 invalid_character (c
)
117 error ("Invalid character: 0%o, %d, 0x%x", c
, c
, c
);
120 /* Parse string STR of length LENGTH and fetch information of a
121 character at STR. Set BYTES to the byte length the character
122 occupies, CHARSET, C1, C2 to proper values of the character. */
124 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
127 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
129 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
130 else if ((bytes) == 2) \
132 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
133 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
135 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
137 else if ((bytes) == 3) \
139 if ((c1) < LEADING_CODE_PRIVATE_11) \
140 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
142 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
145 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
148 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
149 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
150 ((charset) == CHARSET_ASCII \
151 ? ((c1) >= 0 && (c1) <= 0x7F) \
152 : ((charset) == CHARSET_8_BIT_CONTROL \
153 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
154 : ((charset) == CHARSET_8_BIT_GRAPHIC \
155 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
156 : (CHARSET_DIMENSION (charset) == 1 \
157 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
158 : ((c1) >= 0x20 && (c1) <= 0x7F \
159 && (c2) >= 0x20 && (c2) <= 0x7F)))))
161 /* Store multi-byte form of the character C in STR. The caller should
162 allocate at least 4-byte area at STR in advance. Returns the
163 length of the multi-byte form. If C is an invalid character code,
167 char_to_string_1 (c
, str
)
171 unsigned char *p
= str
;
173 if (c
& CHAR_MODIFIER_MASK
) /* This includes the case C is negative. */
175 /* Multibyte character can't have a modifier bit. */
176 if (! SINGLE_BYTE_CHAR_P ((c
& ~CHAR_MODIFIER_MASK
)))
179 /* For Meta, Shift, and Control modifiers, we need special care. */
182 /* Move the meta bit to the right place for a string. */
183 c
= (c
& ~CHAR_META
) | 0x80;
187 /* Shift modifier is valid only with [A-Za-z]. */
188 if ((c
& 0377) >= 'A' && (c
& 0377) <= 'Z')
190 else if ((c
& 0377) >= 'a' && (c
& 0377) <= 'z')
191 c
= (c
& ~CHAR_SHIFT
) - ('a' - 'A');
195 /* Simulate the code in lread.c. */
196 /* Allow `\C- ' and `\C-?'. */
197 if (c
== (CHAR_CTL
| ' '))
199 else if (c
== (CHAR_CTL
| '?'))
201 /* ASCII control chars are made from letters (both cases),
202 as well as the non-letters within 0100...0137. */
203 else if ((c
& 0137) >= 0101 && (c
& 0137) <= 0132)
204 c
&= (037 | (~0177 & ~CHAR_CTL
));
205 else if ((c
& 0177) >= 0100 && (c
& 0177) <= 0137)
206 c
&= (037 | (~0177 & ~CHAR_CTL
));
209 /* If C still has any modifier bits, just ignore it. */
210 c
&= ~CHAR_MODIFIER_MASK
;
213 if (SINGLE_BYTE_CHAR_P (c
))
215 if (ASCII_BYTE_P (c
) || c
>= 0xA0)
219 *p
++ = LEADING_CODE_8_BIT_CONTROL
;
223 else if (CHAR_VALID_P (c
, 0))
227 SPLIT_CHAR (c
, charset
, c1
, c2
);
229 if (charset
>= LEADING_CODE_EXT_11
)
230 *p
++ = (charset
< LEADING_CODE_EXT_12
231 ? LEADING_CODE_PRIVATE_11
232 : (charset
< LEADING_CODE_EXT_21
233 ? LEADING_CODE_PRIVATE_12
234 : (charset
< LEADING_CODE_EXT_22
235 ? LEADING_CODE_PRIVATE_21
236 : LEADING_CODE_PRIVATE_22
)));
238 if (c1
> 0 && c1
< 32 || c2
> 0 && c2
< 32)
254 /* Store multi-byte form of the character C in STR. The caller should
255 allocate at least 4-byte area at STR in advance. Returns the
256 length of the multi-byte form. If C is an invalid character code,
259 Use macro `CHAR_STRING (C, STR)' instead of calling this function
260 directly if C can be an ASCII character. */
263 char_to_string (c
, str
)
268 len
= char_to_string_1 (c
, str
);
270 invalid_character (c
);
275 /* Return the non-ASCII character corresponding to multi-byte form at
276 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
277 length of the multibyte form in *ACTUAL_LEN.
279 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
280 this function directly if you want ot handle ASCII characters as
284 string_to_char (str
, len
, actual_len
)
285 const unsigned char *str
;
286 int len
, *actual_len
;
288 int c
, bytes
, charset
, c1
, c2
;
290 SPLIT_MULTIBYTE_SEQ (str
, len
, bytes
, charset
, c1
, c2
);
291 c
= MAKE_CHAR (charset
, c1
, c2
);
297 /* Return the length of the multi-byte form at string STR of length LEN.
298 Use the macro MULTIBYTE_FORM_LENGTH instead. */
300 multibyte_form_length (str
, len
)
301 const unsigned char *str
;
306 PARSE_MULTIBYTE_SEQ (str
, len
, bytes
);
310 /* Check multibyte form at string STR of length LEN and set variables
311 pointed by CHARSET, C1, and C2 to charset and position codes of the
312 character at STR, and return 0. If there's no multibyte character,
313 return -1. This should be used only in the macro SPLIT_STRING
314 which checks range of STR in advance. */
317 split_string (str
, len
, charset
, c1
, c2
)
318 const unsigned char *str
;
319 unsigned char *c1
, *c2
;
322 register int bytes
, cs
, code1
, code2
= -1;
324 SPLIT_MULTIBYTE_SEQ (str
, len
, bytes
, cs
, code1
, code2
);
325 if (cs
== CHARSET_ASCII
)
333 /* Return 1 iff character C has valid printable glyph.
334 Use the macro CHAR_PRINTABLE_P instead. */
341 if (ASCII_BYTE_P (c
))
343 else if (SINGLE_BYTE_CHAR_P (c
))
345 else if (c
>= MAX_CHAR
)
348 SPLIT_CHAR (c
, charset
, c1
, c2
);
349 if (! CHARSET_DEFINED_P (charset
))
351 if (CHARSET_CHARS (charset
) == 94
352 ? c1
<= 32 || c1
>= 127
355 if (CHARSET_DIMENSION (charset
) == 2
356 && (CHARSET_CHARS (charset
) == 94
357 ? c2
<= 32 || c2
>= 127
363 /* Translate character C by translation table TABLE. If C
364 is negative, translate a character specified by CHARSET, C1, and C2
365 (C1 and C2 are code points of the character). If no translation is
366 found in TABLE, return C. */
368 translate_char (table
, c
, charset
, c1
, c2
)
370 int c
, charset
, c1
, c2
;
373 int alt_charset
, alt_c1
, alt_c2
, dimension
;
375 if (c
< 0) c
= MAKE_CHAR (charset
, (c1
& 0x7F) , (c2
& 0x7F));
376 if (!CHAR_TABLE_P (table
)
377 || (ch
= Faref (table
, make_number (c
)), !NATNUMP (ch
)))
380 SPLIT_CHAR (XFASTINT (ch
), alt_charset
, alt_c1
, alt_c2
);
381 dimension
= CHARSET_DIMENSION (alt_charset
);
382 if (dimension
== 1 && alt_c1
> 0 || dimension
== 2 && alt_c2
> 0)
383 /* CH is not a generic character, just return it. */
384 return XFASTINT (ch
);
386 /* Since CH is a generic character, we must return a specific
387 charater which has the same position codes as C from CH. */
389 SPLIT_CHAR (c
, charset
, c1
, c2
);
390 if (dimension
!= CHARSET_DIMENSION (charset
))
391 /* We can't make such a character because of dimension mismatch. */
393 return MAKE_CHAR (alt_charset
, c1
, c2
);
396 /* Convert the unibyte character C to multibyte based on
397 Vnonascii_translation_table or nonascii_insert_offset. If they can't
398 convert C to a valid multibyte character, convert it based on
399 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
402 unibyte_char_to_multibyte (c
)
405 if (c
< 0400 && c
>= 0200)
409 if (! NILP (Vnonascii_translation_table
))
411 c
= XINT (Faref (Vnonascii_translation_table
, make_number (c
)));
412 if (c
>= 0400 && ! char_valid_p (c
, 0))
413 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
415 else if (c
>= 0240 && nonascii_insert_offset
> 0)
417 c
+= nonascii_insert_offset
;
418 if (c
< 0400 || ! char_valid_p (c
, 0))
419 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
422 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
428 /* Convert the multibyte character C to unibyte 8-bit character based
429 on Vnonascii_translation_table or nonascii_insert_offset. If
430 REV_TBL is non-nil, it should be a reverse table of
431 Vnonascii_translation_table, i.e. what given by:
432 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
435 multibyte_char_to_unibyte (c
, rev_tbl
)
439 if (!SINGLE_BYTE_CHAR_P (c
))
443 if (! CHAR_TABLE_P (rev_tbl
)
444 && CHAR_TABLE_P (Vnonascii_translation_table
))
445 rev_tbl
= Fchar_table_extra_slot (Vnonascii_translation_table
,
447 if (CHAR_TABLE_P (rev_tbl
))
450 temp
= Faref (rev_tbl
, make_number (c
));
454 c
= (c_save
& 0177) + 0200;
458 if (nonascii_insert_offset
> 0)
459 c
-= nonascii_insert_offset
;
460 if (c
< 128 || c
>= 256)
461 c
= (c_save
& 0177) + 0200;
469 /* Update the table Vcharset_table with the given arguments (see the
470 document of `define-charset' for the meaning of each argument).
471 Several other table contents are also updated. The caller should
472 check the validity of CHARSET-ID and the remaining arguments in
476 update_charset_table (charset_id
, dimension
, chars
, width
, direction
,
477 iso_final_char
, iso_graphic_plane
,
478 short_name
, long_name
, description
)
479 Lisp_Object charset_id
, dimension
, chars
, width
, direction
;
480 Lisp_Object iso_final_char
, iso_graphic_plane
;
481 Lisp_Object short_name
, long_name
, description
;
483 int charset
= XINT (charset_id
);
485 unsigned char leading_code_base
, leading_code_ext
;
487 if (NILP (CHARSET_TABLE_ENTRY (charset
)))
488 CHARSET_TABLE_ENTRY (charset
)
489 = Fmake_vector (make_number (CHARSET_MAX_IDX
), Qnil
);
491 if (NILP (long_name
))
492 long_name
= short_name
;
493 if (NILP (description
))
494 description
= long_name
;
496 /* Get byte length of multibyte form, base leading-code, and
497 extended leading-code of the charset. See the comment under the
498 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
499 bytes
= XINT (dimension
);
500 if (charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
502 /* Official charset, it doesn't have an extended leading-code. */
503 if (charset
!= CHARSET_ASCII
&& charset
!= CHARSET_8_BIT_GRAPHIC
)
504 bytes
+= 1; /* For a base leading-code. */
505 leading_code_base
= charset
;
506 leading_code_ext
= 0;
510 /* Private charset. */
511 bytes
+= 2; /* For base and extended leading-codes. */
513 = (charset
< LEADING_CODE_EXT_12
514 ? LEADING_CODE_PRIVATE_11
515 : (charset
< LEADING_CODE_EXT_21
516 ? LEADING_CODE_PRIVATE_12
517 : (charset
< LEADING_CODE_EXT_22
518 ? LEADING_CODE_PRIVATE_21
519 : LEADING_CODE_PRIVATE_22
)));
520 leading_code_ext
= charset
;
521 if (BYTES_BY_CHAR_HEAD (leading_code_base
) != bytes
)
522 error ("Invalid dimension for the charset-ID %d", charset
);
525 CHARSET_TABLE_INFO (charset
, CHARSET_ID_IDX
) = charset_id
;
526 CHARSET_TABLE_INFO (charset
, CHARSET_BYTES_IDX
) = make_number (bytes
);
527 CHARSET_TABLE_INFO (charset
, CHARSET_DIMENSION_IDX
) = dimension
;
528 CHARSET_TABLE_INFO (charset
, CHARSET_CHARS_IDX
) = chars
;
529 CHARSET_TABLE_INFO (charset
, CHARSET_WIDTH_IDX
) = width
;
530 CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
) = direction
;
531 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_BASE_IDX
)
532 = make_number (leading_code_base
);
533 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_EXT_IDX
)
534 = make_number (leading_code_ext
);
535 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_FINAL_CHAR_IDX
) = iso_final_char
;
536 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_GRAPHIC_PLANE_IDX
)
538 CHARSET_TABLE_INFO (charset
, CHARSET_SHORT_NAME_IDX
) = short_name
;
539 CHARSET_TABLE_INFO (charset
, CHARSET_LONG_NAME_IDX
) = long_name
;
540 CHARSET_TABLE_INFO (charset
, CHARSET_DESCRIPTION_IDX
) = description
;
541 CHARSET_TABLE_INFO (charset
, CHARSET_PLIST_IDX
) = Qnil
;
544 /* If we have already defined a charset which has the same
545 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
546 DIRECTION, we must update the entry REVERSE-CHARSET of both
547 charsets. If there's no such charset, the value of the entry
551 for (i
= 0; i
<= MAX_CHARSET
; i
++)
552 if (!NILP (CHARSET_TABLE_ENTRY (i
)))
554 if (CHARSET_DIMENSION (i
) == XINT (dimension
)
555 && CHARSET_CHARS (i
) == XINT (chars
)
556 && CHARSET_ISO_FINAL_CHAR (i
) == XINT (iso_final_char
)
557 && CHARSET_DIRECTION (i
) != XINT (direction
))
559 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
561 CHARSET_TABLE_INFO (i
, CHARSET_REVERSE_CHARSET_IDX
) = charset_id
;
566 /* No such a charset. */
567 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
571 if (charset
!= CHARSET_ASCII
&& charset
!= CHARSET_8_BIT_GRAPHIC
572 && charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
574 bytes_by_char_head
[leading_code_base
] = bytes
;
575 width_by_char_head
[leading_code_base
] = XINT (width
);
577 /* Update table emacs_code_class. */
578 emacs_code_class
[charset
] = (bytes
== 2
579 ? EMACS_leading_code_2
581 ? EMACS_leading_code_3
582 : EMACS_leading_code_4
));
585 /* Update table iso_charset_table. */
586 if (XINT (iso_final_char
) >= 0
587 && ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) < 0)
588 ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) = charset
;
593 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
596 get_charset_id (charset_symbol
)
597 Lisp_Object charset_symbol
;
602 /* This originally used a ?: operator, but reportedly the HP-UX
603 compiler version HP92453-01 A.10.32.22 miscompiles that. */
604 if (SYMBOLP (charset_symbol
)
605 && VECTORP (val
= Fget (charset_symbol
, Qcharset
))
606 && CHARSET_VALID_P (charset
=
607 XINT (XVECTOR (val
)->contents
[CHARSET_ID_IDX
])))
613 /* Return an identification number for a new private charset of
614 DIMENSION and WIDTH. If there's no more room for the new charset,
617 get_new_private_charset_id (dimension
, width
)
618 int dimension
, width
;
620 int charset
, from
, to
;
624 from
= LEADING_CODE_EXT_11
;
625 to
= LEADING_CODE_EXT_21
;
629 from
= LEADING_CODE_EXT_21
;
630 to
= LEADING_CODE_EXT_MAX
+ 1;
633 for (charset
= from
; charset
< to
; charset
++)
634 if (!CHARSET_DEFINED_P (charset
)) break;
636 return make_number (charset
< to
? charset
: 0);
639 DEFUN ("define-charset", Fdefine_charset
, Sdefine_charset
, 3, 3, 0,
640 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
641 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
642 treated as a private charset.\n\
643 INFO-VECTOR is a vector of the format:\n\
644 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
645 SHORT-NAME LONG-NAME DESCRIPTION]\n\
646 The meanings of each elements is as follows:\n\
647 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
648 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
649 WIDTH (integer) is the number of columns a character in the charset\n\
650 occupies on the screen: one of 0, 1, and 2.\n\
652 DIRECTION (integer) is the rendering direction of characters in the\n\
653 charset when rendering. If 0, render from left to right, else\n\
654 render from right to left.\n\
656 ISO-FINAL-CHAR (character) is the final character of the\n\
657 corresponding ISO 2022 charset.\n\
658 It may be -1 if the charset is internal use only.\n\
660 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
661 while encoding to variants of ISO 2022 coding system, one of the\n\
662 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
663 It may be -1 if the charset is internal use only.\n\
665 SHORT-NAME (string) is the short name to refer to the charset.\n\
667 LONG-NAME (string) is the long name to refer to the charset.\n\
669 DESCRIPTION (string) is the description string of the charset.")
670 (charset_id
, charset_symbol
, info_vector
)
671 Lisp_Object charset_id
, charset_symbol
, info_vector
;
675 if (!NILP (charset_id
))
676 CHECK_NUMBER (charset_id
, 0);
677 CHECK_SYMBOL (charset_symbol
, 1);
678 CHECK_VECTOR (info_vector
, 2);
680 if (! NILP (charset_id
))
682 if (! CHARSET_VALID_P (XINT (charset_id
)))
683 error ("Invalid CHARSET: %d", XINT (charset_id
));
684 else if (CHARSET_DEFINED_P (XINT (charset_id
)))
685 error ("Already defined charset: %d", XINT (charset_id
));
688 vec
= XVECTOR (info_vector
)->contents
;
689 if (XVECTOR (info_vector
)->size
!= 9
690 || !INTEGERP (vec
[0]) || !(XINT (vec
[0]) == 1 || XINT (vec
[0]) == 2)
691 || !INTEGERP (vec
[1]) || !(XINT (vec
[1]) == 94 || XINT (vec
[1]) == 96)
692 || !INTEGERP (vec
[2]) || !(XINT (vec
[2]) == 1 || XINT (vec
[2]) == 2)
693 || !INTEGERP (vec
[3]) || !(XINT (vec
[3]) == 0 || XINT (vec
[3]) == 1)
694 || !INTEGERP (vec
[4])
695 || !(XINT (vec
[4]) == -1 || XINT (vec
[4]) >= '0' && XINT (vec
[4]) <= '~')
696 || !INTEGERP (vec
[5])
697 || !(XINT (vec
[5]) == -1 || XINT (vec
[5]) == 0 || XINT (vec
[5]) == 1)
700 || !STRINGP (vec
[8]))
701 error ("Invalid info-vector argument for defining charset %s",
702 XSYMBOL (charset_symbol
)->name
->data
);
704 if (NILP (charset_id
))
706 charset_id
= get_new_private_charset_id (XINT (vec
[0]), XINT (vec
[2]));
707 if (XINT (charset_id
) == 0)
708 error ("There's no room for a new private charset %s",
709 XSYMBOL (charset_symbol
)->name
->data
);
712 update_charset_table (charset_id
, vec
[0], vec
[1], vec
[2], vec
[3],
713 vec
[4], vec
[5], vec
[6], vec
[7], vec
[8]);
714 Fput (charset_symbol
, Qcharset
, CHARSET_TABLE_ENTRY (XINT (charset_id
)));
715 CHARSET_SYMBOL (XINT (charset_id
)) = charset_symbol
;
716 Vcharset_list
= Fcons (charset_symbol
, Vcharset_list
);
717 Fupdate_coding_systems_internal ();
721 DEFUN ("generic-character-list", Fgeneric_character_list
,
722 Sgeneric_character_list
, 0, 0, 0,
723 "Return a list of all possible generic characters.\n\
724 It includes a generic character for a charset not yet defined.")
727 return Vgeneric_character_list
;
730 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char
,
731 Sget_unused_iso_final_char
, 2, 2, 0,
732 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
733 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
734 CHARS is the number of characters in a dimension: 94 or 96.\n\
736 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
737 If there's no unused final char for the specified kind of charset,\n\
740 Lisp_Object dimension
, chars
;
744 CHECK_NUMBER (dimension
, 0);
745 CHECK_NUMBER (chars
, 1);
746 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
747 error ("Invalid charset dimension %d, it should be 1 or 2",
749 if (XINT (chars
) != 94 && XINT (chars
) != 96)
750 error ("Invalid charset chars %d, it should be 94 or 96",
752 for (final_char
= '0'; final_char
<= '?'; final_char
++)
754 if (ISO_CHARSET_TABLE (dimension
, chars
, make_number (final_char
)) < 0)
757 return (final_char
<= '?' ? make_number (final_char
) : Qnil
);
760 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset
, Sdeclare_equiv_charset
,
762 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
763 CHARSET should be defined by `defined-charset' in advance.")
764 (dimension
, chars
, final_char
, charset_symbol
)
765 Lisp_Object dimension
, chars
, final_char
, charset_symbol
;
769 CHECK_NUMBER (dimension
, 0);
770 CHECK_NUMBER (chars
, 1);
771 CHECK_NUMBER (final_char
, 2);
772 CHECK_SYMBOL (charset_symbol
, 3);
774 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
775 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension
));
776 if (XINT (chars
) != 94 && XINT (chars
) != 96)
777 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars
));
778 if (XINT (final_char
) < '0' || XFASTINT (final_char
) > '~')
779 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars
));
780 if ((charset
= get_charset_id (charset_symbol
)) < 0)
781 error ("Invalid charset %s", XSYMBOL (charset_symbol
)->name
->data
);
783 ISO_CHARSET_TABLE (dimension
, chars
, final_char
) = charset
;
787 /* Return information about charsets in the text at PTR of NBYTES
788 bytes, which are NCHARS characters. The value is:
790 0: Each character is represented by one byte. This is always
791 true for unibyte text.
792 1: No charsets other than ascii eight-bit-control,
793 eight-bit-graphic, and latin-1 are found.
796 In addition, if CHARSETS is nonzero, for each found charset N, set
797 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
798 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
799 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
800 1 (note that there's no charset whose ID is 1). */
803 find_charset_in_text (ptr
, nchars
, nbytes
, charsets
, table
)
805 int nchars
, nbytes
, *charsets
;
808 if (nchars
== nbytes
)
810 if (charsets
&& nbytes
> 0)
812 unsigned char *endp
= ptr
+ nbytes
;
815 while (ptr
< endp
&& maskbits
!= 7)
817 maskbits
|= (*ptr
< 0x80 ? 1 : *ptr
< 0xA0 ? 2 : 4);
822 charsets
[CHARSET_ASCII
] = 1;
824 charsets
[CHARSET_8_BIT_CONTROL
] = 1;
826 charsets
[CHARSET_8_BIT_GRAPHIC
] = 1;
833 int bytes
, charset
, c1
, c2
;
835 if (! CHAR_TABLE_P (table
))
840 SPLIT_MULTIBYTE_SEQ (ptr
, len
, bytes
, charset
, c1
, c2
);
843 if (!CHARSET_DEFINED_P (charset
))
845 else if (! NILP (table
))
847 int c
= translate_char (table
, -1, charset
, c1
, c2
);
849 charset
= CHAR_CHARSET (c
);
853 && charset
!= CHARSET_ASCII
854 && charset
!= CHARSET_8_BIT_CONTROL
855 && charset
!= CHARSET_8_BIT_GRAPHIC
856 && charset
!= charset_latin_iso8859_1
)
860 charsets
[charset
] = 1;
861 else if (return_val
== 2)
868 DEFUN ("find-charset-region", Ffind_charset_region
, Sfind_charset_region
,
870 "Return a list of charsets in the region between BEG and END.\n\
871 BEG and END are buffer positions.\n\
872 Optional arg TABLE if non-nil is a translation table to look up.\n\
874 If the region contains invalid multibyte characters,\n\
875 `unknown' is included in the returned list.\n\
877 If the current buffer is unibyte, the returned list may contain\n\
878 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
880 Lisp_Object beg
, end
, table
;
882 int charsets
[MAX_CHARSET
+ 1];
883 int from
, from_byte
, to
, stop
, stop_byte
, i
;
886 validate_region (&beg
, &end
);
887 from
= XFASTINT (beg
);
888 stop
= to
= XFASTINT (end
);
890 if (from
< GPT
&& GPT
< to
)
893 stop_byte
= GPT_BYTE
;
896 stop_byte
= CHAR_TO_BYTE (stop
);
898 from_byte
= CHAR_TO_BYTE (from
);
900 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
903 find_charset_in_text (BYTE_POS_ADDR (from_byte
), stop
- from
,
904 stop_byte
- from_byte
, charsets
, table
);
907 from
= stop
, from_byte
= stop_byte
;
908 stop
= to
, stop_byte
= CHAR_TO_BYTE (stop
);
916 val
= Fcons (Qunknown
, val
);
917 for (i
= MAX_CHARSET
; i
>= MIN_CHARSET_OFFICIAL_DIMENSION1
; i
--)
919 val
= Fcons (CHARSET_SYMBOL (i
), val
);
921 val
= Fcons (Qascii
, val
);
925 DEFUN ("find-charset-string", Ffind_charset_string
, Sfind_charset_string
,
927 "Return a list of charsets in STR.\n\
928 Optional arg TABLE if non-nil is a translation table to look up.\n\
930 If the string contains invalid multibyte characters,\n\
931 `unknown' is included in the returned list.\n\
933 If STR is unibyte, the returned list may contain\n\
934 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
936 Lisp_Object str
, table
;
938 int charsets
[MAX_CHARSET
+ 1];
942 CHECK_STRING (str
, 0);
944 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
945 find_charset_in_text (XSTRING (str
)->data
, XSTRING (str
)->size
,
946 STRING_BYTES (XSTRING (str
)), charsets
, table
);
950 val
= Fcons (Qunknown
, val
);
951 for (i
= MAX_CHARSET
; i
>= MIN_CHARSET_OFFICIAL_DIMENSION1
; i
--)
953 val
= Fcons (CHARSET_SYMBOL (i
), val
);
955 val
= Fcons (Qascii
, val
);
960 DEFUN ("make-char-internal", Fmake_char_internal
, Smake_char_internal
, 1, 3, 0,
962 (charset
, code1
, code2
)
963 Lisp_Object charset
, code1
, code2
;
965 int charset_id
, c1
, c2
;
967 CHECK_NUMBER (charset
, 0);
968 charset_id
= XINT (charset
);
969 if (!CHARSET_DEFINED_P (charset_id
))
970 error ("Invalid charset ID: %d", XINT (charset
));
976 CHECK_NUMBER (code1
, 1);
983 CHECK_NUMBER (code2
, 2);
987 if (charset_id
== CHARSET_ASCII
)
989 if (c1
< 0 || c1
> 0x7F)
990 goto invalid_code_posints
;
991 return make_number (c1
);
993 else if (charset_id
== CHARSET_8_BIT_CONTROL
)
997 else if (c1
< 0x80 || c1
> 0x9F)
998 goto invalid_code_posints
;
999 return make_number (c1
);
1001 else if (charset_id
== CHARSET_8_BIT_GRAPHIC
)
1005 else if (c1
< 0xA0 || c1
> 0xFF)
1006 goto invalid_code_posints
;
1007 return make_number (c1
);
1009 else if (c1
< 0 || c1
> 0xFF || c2
< 0 || c2
> 0xFF)
1010 goto invalid_code_posints
;
1016 ? !CHAR_COMPONENTS_VALID_P (charset_id
, c1
, 0x20)
1017 : !CHAR_COMPONENTS_VALID_P (charset_id
, c1
, c2
)))
1018 goto invalid_code_posints
;
1019 return make_number (MAKE_CHAR (charset_id
, c1
, c2
));
1021 invalid_code_posints
:
1022 error ("Invalid code points for charset ID %d: %d %d", charset_id
, c1
, c2
);
1025 DEFUN ("split-char", Fsplit_char
, Ssplit_char
, 1, 1, 0,
1026 "Return list of charset and one or two position-codes of CHAR.\n\
1027 If CHAR is invalid as a character code,\n\
1028 return a list of symbol `unknown' and CHAR.")
1032 int c
, charset
, c1
, c2
;
1034 CHECK_NUMBER (ch
, 0);
1036 if (!CHAR_VALID_P (c
, 1))
1037 return Fcons (Qunknown
, Fcons (ch
, Qnil
));
1038 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
1040 ? Fcons (CHARSET_SYMBOL (charset
),
1041 Fcons (make_number (c1
), Fcons (make_number (c2
), Qnil
)))
1042 : Fcons (CHARSET_SYMBOL (charset
), Fcons (make_number (c1
), Qnil
)));
1045 DEFUN ("char-charset", Fchar_charset
, Schar_charset
, 1, 1, 0,
1046 "Return charset of CHAR.")
1050 CHECK_NUMBER (ch
, 0);
1052 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch
)));
1055 DEFUN ("charset-after", Fcharset_after
, Scharset_after
, 0, 1, 0,
1056 "Return charset of a character in the current buffer at position POS.\n\
1057 If POS is nil, it defauls to the current point.\n\
1058 If POS is out of range, the value is nil.")
1065 ch
= Fchar_after (pos
);
1066 if (! INTEGERP (ch
))
1068 charset
= CHAR_CHARSET (XINT (ch
));
1069 return CHARSET_SYMBOL (charset
);
1072 DEFUN ("iso-charset", Fiso_charset
, Siso_charset
, 3, 3, 0,
1073 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1075 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1076 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1077 where as Emacs distinguishes them by charset symbol.\n\
1078 See the documentation of the function `charset-info' for the meanings of\n\
1079 DIMENSION, CHARS, and FINAL-CHAR.")
1080 (dimension
, chars
, final_char
)
1081 Lisp_Object dimension
, chars
, final_char
;
1085 CHECK_NUMBER (dimension
, 0);
1086 CHECK_NUMBER (chars
, 1);
1087 CHECK_NUMBER (final_char
, 2);
1089 if ((charset
= ISO_CHARSET_TABLE (dimension
, chars
, final_char
)) < 0)
1091 return CHARSET_SYMBOL (charset
);
1094 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1095 generic character. If GENERICP is zero, return nonzero iff C is a
1096 valid normal character. Do not call this function directly,
1097 instead use macro CHAR_VALID_P. */
1099 char_valid_p (c
, genericp
)
1102 int charset
, c1
, c2
;
1104 if (c
< 0 || c
>= MAX_CHAR
)
1106 if (SINGLE_BYTE_CHAR_P (c
))
1108 SPLIT_CHAR (c
, charset
, c1
, c2
);
1113 if (c2
<= 0) c2
= 0x20;
1117 if (c2
<= 0) c1
= c2
= 0x20;
1120 return (CHARSET_DEFINED_P (charset
)
1121 && CHAR_COMPONENTS_VALID_P (charset
, c1
, c2
));
1124 DEFUN ("char-valid-p", Fchar_valid_p
, Schar_valid_p
, 1, 2, 0,
1125 "Return t if OBJECT is a valid normal character.\n\
1126 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1127 a valid generic character.")
1129 Lisp_Object object
, genericp
;
1131 if (! NATNUMP (object
))
1133 return (CHAR_VALID_P (XFASTINT (object
), !NILP (genericp
)) ? Qt
: Qnil
);
1136 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
1137 Sunibyte_char_to_multibyte
, 1, 1, 0,
1138 "Convert the unibyte character CH to multibyte character.\n\
1139 The conversion is done based on `nonascii-translation-table' (which see)\n\
1140 or `nonascii-insert-offset' (which see).")
1146 CHECK_NUMBER (ch
, 0);
1148 if (c
< 0 || c
>= 0400)
1149 error ("Invalid unibyte character: %d", c
);
1150 c
= unibyte_char_to_multibyte (c
);
1152 error ("Can't convert to multibyte character: %d", XINT (ch
));
1153 return make_number (c
);
1156 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte
,
1157 Smultibyte_char_to_unibyte
, 1, 1, 0,
1158 "Convert the multibyte character CH to unibyte character.\n\
1159 The conversion is done based on `nonascii-translation-table' (which see)\n\
1160 or `nonascii-insert-offset' (which see).")
1166 CHECK_NUMBER (ch
, 0);
1168 if (! CHAR_VALID_P (c
, 0))
1169 error ("Invalid multibyte character: %d", c
);
1170 c
= multibyte_char_to_unibyte (c
, Qnil
);
1172 error ("Can't convert to unibyte character: %d", XINT (ch
));
1173 return make_number (c
);
1176 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
1177 "Return 1 regardless of the argument CHAR.\n\
1178 This is now an obsolete function. We keep it just for backward compatibility.")
1182 CHECK_NUMBER (ch
, 0);
1183 return make_number (1);
1186 /* Return how many bytes C will occupy in a multibyte buffer.
1187 Don't call this function directly, instead use macro CHAR_BYTES. */
1194 if (ASCII_BYTE_P (c
) || (c
& ~((1 << CHARACTERBITS
) -1)))
1196 if (SINGLE_BYTE_CHAR_P (c
) && c
>= 0xA0)
1199 charset
= CHAR_CHARSET (c
);
1200 return (CHARSET_DEFINED_P (charset
) ? CHARSET_BYTES (charset
) : 1);
1203 /* Return the width of character of which multi-byte form starts with
1204 C. The width is measured by how many columns occupied on the
1205 screen when displayed in the current buffer. */
1207 #define ONE_BYTE_CHAR_WIDTH(c) \
1210 ? XFASTINT (current_buffer->tab_width) \
1211 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1215 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1216 : ((! NILP (current_buffer->enable_multibyte_characters) \
1217 && BASE_LEADING_CODE_P (c)) \
1218 ? WIDTH_BY_CHAR_HEAD (c) \
1221 DEFUN ("char-width", Fchar_width
, Schar_width
, 1, 1, 0,
1222 "Return width of CHAR when displayed in the current buffer.\n\
1223 The width is measured by how many columns it occupies on the screen.\n\
1224 Tab is taken to occupy `tab-width' columns.")
1228 Lisp_Object val
, disp
;
1230 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1232 CHECK_NUMBER (ch
, 0);
1236 /* Get the way the display table would display it. */
1237 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
1240 XSETINT (val
, XVECTOR (disp
)->size
);
1241 else if (SINGLE_BYTE_CHAR_P (c
))
1242 XSETINT (val
, ONE_BYTE_CHAR_WIDTH (c
));
1245 int charset
= CHAR_CHARSET (c
);
1247 XSETFASTINT (val
, CHARSET_WIDTH (charset
));
1252 /* Return width of string STR of length LEN when displayed in the
1253 current buffer. The width is measured by how many columns it
1254 occupies on the screen. */
1261 return c_string_width (str
, len
, -1, NULL
, NULL
);
1264 /* Return width of string STR of length LEN when displayed in the
1265 current buffer. The width is measured by how many columns it
1266 occupies on the screen. If PRECISION > 0, return the width of
1267 longest substring that doesn't exceed PRECISION, and set number of
1268 characters and bytes of the substring in *NCHARS and *NBYTES
1272 c_string_width (str
, len
, precision
, nchars
, nbytes
)
1274 int precision
, *nchars
, *nbytes
;
1276 int i
= 0, i_byte
= 0;
1279 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1281 while (i_byte
< len
)
1283 int bytes
, thiswidth
;
1288 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
1291 val
= DISP_CHAR_VECTOR (dp
, c
);
1293 thiswidth
= XVECTOR (val
)->size
;
1295 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1300 PARSE_MULTIBYTE_SEQ (str
+ i_byte
, len
- i_byte
, bytes
);
1301 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1305 && (width
+ thiswidth
> precision
))
1325 /* Return width of Lisp string STRING when displayed in the current
1326 buffer. The width is measured by how many columns it occupies on
1327 the screen while paying attention to compositions. If PRECISION >
1328 0, return the width of longest substring that doesn't exceed
1329 PRECISION, and set number of characters and bytes of the substring
1330 in *NCHARS and *NBYTES respectively. */
1333 lisp_string_width (string
, precision
, nchars
, nbytes
)
1335 int precision
, *nchars
, *nbytes
;
1337 int len
= XSTRING (string
)->size
;
1338 int len_byte
= STRING_BYTES (XSTRING (string
));
1339 unsigned char *str
= XSTRING (string
)->data
;
1340 int i
= 0, i_byte
= 0;
1342 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1346 int chars
, bytes
, thiswidth
;
1351 if (find_composition (i
, -1, &ignore
, &end
, &val
, string
)
1352 && ((cmp_id
= get_composition_id (i
, i_byte
, end
- i
, val
, string
))
1355 thiswidth
= composition_table
[cmp_id
]->width
;
1357 bytes
= string_char_to_byte (string
, end
) - i_byte
;
1361 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
1364 val
= DISP_CHAR_VECTOR (dp
, c
);
1366 thiswidth
= XVECTOR (val
)->size
;
1368 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1373 PARSE_MULTIBYTE_SEQ (str
+ i_byte
, len_byte
- i_byte
, bytes
);
1374 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1378 && (width
+ thiswidth
> precision
))
1398 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
1399 "Return width of STRING when displayed in the current buffer.\n\
1400 Width is measured by how many columns it occupies on the screen.\n\
1401 When calculating width of a multibyte character in STRING,\n\
1402 only the base leading-code is considered; the validity of\n\
1403 the following bytes is not checked. Tabs in STRING are always\n\
1404 taken to occupy `tab-width' columns.")
1410 CHECK_STRING (str
, 0);
1411 XSETFASTINT (val
, lisp_string_width (str
, -1, NULL
, NULL
));
1415 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
1416 "Return the direction of CHAR.\n\
1417 The returned value is 0 for left-to-right and 1 for right-to-left.")
1423 CHECK_NUMBER (ch
, 0);
1424 charset
= CHAR_CHARSET (XFASTINT (ch
));
1425 if (!CHARSET_DEFINED_P (charset
))
1426 invalid_character (XINT (ch
));
1427 return CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
);
1430 DEFUN ("chars-in-region", Fchars_in_region
, Schars_in_region
, 2, 2, 0,
1431 "Return number of characters between BEG and END.")
1433 Lisp_Object beg
, end
;
1437 CHECK_NUMBER_COERCE_MARKER (beg
, 0);
1438 CHECK_NUMBER_COERCE_MARKER (end
, 1);
1440 from
= min (XFASTINT (beg
), XFASTINT (end
));
1441 to
= max (XFASTINT (beg
), XFASTINT (end
));
1443 return make_number (to
- from
);
1446 /* Return the number of characters in the NBYTES bytes at PTR.
1447 This works by looking at the contents and checking for multibyte sequences.
1448 However, if the current buffer has enable-multibyte-characters = nil,
1449 we treat each byte as a character. */
1452 chars_in_text (ptr
, nbytes
)
1456 /* current_buffer is null at early stages of Emacs initialization. */
1457 if (current_buffer
== 0
1458 || NILP (current_buffer
->enable_multibyte_characters
))
1461 return multibyte_chars_in_text (ptr
, nbytes
);
1464 /* Return the number of characters in the NBYTES bytes at PTR.
1465 This works by looking at the contents and checking for multibyte sequences.
1466 It ignores enable-multibyte-characters. */
1469 multibyte_chars_in_text (ptr
, nbytes
)
1473 unsigned char *endp
;
1476 endp
= ptr
+ nbytes
;
1481 PARSE_MULTIBYTE_SEQ (ptr
, endp
- ptr
, bytes
);
1489 /* Parse unibyte text at STR of LEN bytes as multibyte text, and
1490 count the numbers of characters and bytes in it. On counting
1491 bytes, pay attention to the fact that 8-bit characters in the range
1492 0x80..0x9F are represented by 2 bytes in multibyte text. */
1494 parse_str_as_multibyte (str
, len
, nchars
, nbytes
)
1496 int len
, *nchars
, *nbytes
;
1498 unsigned char *endp
= str
+ len
;
1499 int n
, chars
= 0, bytes
= 0;
1503 if (UNIBYTE_STR_AS_MULTIBYTE_P (str
, endp
- str
, n
))
1504 str
+= n
, bytes
+= n
;
1514 /* Arrange unibyte text at STR of NBYTES bytes as multibyte text.
1515 It actually converts only 8-bit characters in the range 0x80..0x9F
1516 that don't contruct multibyte characters to multibyte forms. If
1517 NCHARS is nonzero, set *NCHARS to the number of characters in the
1518 text. It is assured that we can use LEN bytes at STR as a work
1519 area and that is enough. Return the number of bytes of the
1523 str_as_multibyte (str
, len
, nbytes
, nchars
)
1525 int len
, nbytes
, *nchars
;
1527 unsigned char *p
= str
, *endp
= str
+ nbytes
;
1532 while (p
< endp
&& UNIBYTE_STR_AS_MULTIBYTE_P (p
, endp
- p
, n
))
1542 safe_bcopy (p
, endp
- nbytes
, nbytes
);
1546 if (UNIBYTE_STR_AS_MULTIBYTE_P (p
, endp
- p
, n
))
1553 *to
++ = LEADING_CODE_8_BIT_CONTROL
;
1554 *to
++ = *p
++ + 0x20;
1563 /* Parse unibyte string at STR of LEN bytes, and return the number of
1564 bytes it may ocupy when converted to multibyte string by
1565 `str_to_multibyte'. */
1568 parse_str_to_multibyte (str
, len
)
1572 unsigned char *endp
= str
+ len
;
1575 for (bytes
= 0; str
< endp
; str
++)
1576 bytes
+= (*str
< 0x80 || *str
>= 0xA0) ? 1 : 2;
1580 /* Convert unibyte text at STR of NBYTES bytes to multibyte text
1581 that contains the same single-byte characters. It actually
1582 converts all 8-bit characters to multibyte forms. It is assured
1583 that we can use LEN bytes at STR as a work area and that is
1587 str_to_multibyte (str
, len
, bytes
)
1591 unsigned char *p
= str
, *endp
= str
+ bytes
;
1594 while (p
< endp
&& (*p
< 0x80 || *p
>= 0xA0)) p
++;
1600 safe_bcopy (p
, endp
- bytes
, bytes
);
1604 if (*p
< 0x80 || *p
>= 0xA0)
1607 *to
++ = LEADING_CODE_8_BIT_CONTROL
, *to
++ = *p
++ + 0x20;
1612 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1613 actually converts only 8-bit characters in the range 0x80..0x9F to
1617 str_as_unibyte (str
, bytes
)
1621 unsigned char *p
= str
, *endp
= str
+ bytes
;
1622 unsigned char *to
= str
;
1624 while (p
< endp
&& *p
!= LEADING_CODE_8_BIT_CONTROL
) p
++;
1628 if (*p
== LEADING_CODE_8_BIT_CONTROL
)
1629 *to
++ = *(p
+ 1) - 0x20, p
+= 2;
1637 DEFUN ("string", Fstring
, Sstring
, 1, MANY
, 0,
1638 "Concatenate all the argument characters and make the result a string.")
1644 unsigned char *buf
= (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH
* n
);
1645 unsigned char *p
= buf
;
1649 for (i
= 0; i
< n
; i
++)
1651 CHECK_NUMBER (args
[i
], 0);
1652 if (!multibyte
&& !SINGLE_BYTE_CHAR_P (XFASTINT (args
[i
])))
1656 for (i
= 0; i
< n
; i
++)
1660 p
+= CHAR_STRING (c
, p
);
1665 return make_string_from_bytes (buf
, n
, p
- buf
);
1671 charset_id_internal (charset_name
)
1676 val
= Fget (intern (charset_name
), Qcharset
);
1678 error ("Charset %s is not defined", charset_name
);
1680 return (XINT (XVECTOR (val
)->contents
[0]));
1683 DEFUN ("setup-special-charsets", Fsetup_special_charsets
,
1684 Ssetup_special_charsets
, 0, 0, 0, "Internal use only.")
1687 charset_latin_iso8859_1
= charset_id_internal ("latin-iso8859-1");
1688 charset_jisx0208_1978
= charset_id_internal ("japanese-jisx0208-1978");
1689 charset_jisx0208
= charset_id_internal ("japanese-jisx0208");
1690 charset_katakana_jisx0201
= charset_id_internal ("katakana-jisx0201");
1691 charset_latin_jisx0201
= charset_id_internal ("latin-jisx0201");
1692 charset_big5_1
= charset_id_internal ("chinese-big5-1");
1693 charset_big5_2
= charset_id_internal ("chinese-big5-2");
1698 init_charset_once ()
1702 staticpro (&Vcharset_table
);
1703 staticpro (&Vcharset_symbol_table
);
1704 staticpro (&Vgeneric_character_list
);
1706 /* This has to be done here, before we call Fmake_char_table. */
1707 Qcharset_table
= intern ("charset-table");
1708 staticpro (&Qcharset_table
);
1710 /* Intern this now in case it isn't already done.
1711 Setting this variable twice is harmless.
1712 But don't staticpro it here--that is done in alloc.c. */
1713 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
1715 /* Now we are ready to set up this property, so we can
1716 create the charset table. */
1717 Fput (Qcharset_table
, Qchar_table_extra_slots
, make_number (0));
1718 Vcharset_table
= Fmake_char_table (Qcharset_table
, Qnil
);
1720 Qunknown
= intern ("unknown");
1721 staticpro (&Qunknown
);
1722 Vcharset_symbol_table
= Fmake_vector (make_number (MAX_CHARSET
+ 1),
1726 for (i
= 0; i
< 2; i
++)
1727 for (j
= 0; j
< 2; j
++)
1728 for (k
= 0; k
< 128; k
++)
1729 iso_charset_table
[i
][j
][k
] = -1;
1731 for (i
= 0; i
< 256; i
++)
1732 bytes_by_char_head
[i
] = 1;
1733 bytes_by_char_head
[LEADING_CODE_PRIVATE_11
] = 3;
1734 bytes_by_char_head
[LEADING_CODE_PRIVATE_12
] = 3;
1735 bytes_by_char_head
[LEADING_CODE_PRIVATE_21
] = 4;
1736 bytes_by_char_head
[LEADING_CODE_PRIVATE_22
] = 4;
1738 for (i
= 0; i
< 128; i
++)
1739 width_by_char_head
[i
] = 1;
1740 for (; i
< 256; i
++)
1741 width_by_char_head
[i
] = 4;
1742 width_by_char_head
[LEADING_CODE_PRIVATE_11
] = 1;
1743 width_by_char_head
[LEADING_CODE_PRIVATE_12
] = 2;
1744 width_by_char_head
[LEADING_CODE_PRIVATE_21
] = 1;
1745 width_by_char_head
[LEADING_CODE_PRIVATE_22
] = 2;
1751 for (i
= 0x81; i
< 0x90; i
++)
1752 val
= Fcons (make_number ((i
- 0x70) << 7), val
);
1753 for (; i
< 0x9A; i
++)
1754 val
= Fcons (make_number ((i
- 0x8F) << 14), val
);
1755 for (i
= 0xA0; i
< 0xF0; i
++)
1756 val
= Fcons (make_number ((i
- 0x70) << 7), val
);
1757 for (; i
< 0xFF; i
++)
1758 val
= Fcons (make_number ((i
- 0xE0) << 14), val
);
1759 Vgeneric_character_list
= Fnreverse (val
);
1762 nonascii_insert_offset
= 0;
1763 Vnonascii_translation_table
= Qnil
;
1771 Qcharset
= intern ("charset");
1772 staticpro (&Qcharset
);
1774 Qascii
= intern ("ascii");
1775 staticpro (&Qascii
);
1777 Qeight_bit_control
= intern ("eight-bit-control");
1778 staticpro (&Qeight_bit_control
);
1780 Qeight_bit_graphic
= intern ("eight-bit-graphic");
1781 staticpro (&Qeight_bit_graphic
);
1783 /* Define special charsets ascii, eight-bit-control, and
1784 eight-bit-graphic. */
1785 update_charset_table (make_number (CHARSET_ASCII
),
1786 make_number (1), make_number (94),
1791 build_string ("ASCII"),
1792 Qnil
, /* same as above */
1793 build_string ("ASCII (ISO646 IRV)"));
1794 CHARSET_SYMBOL (CHARSET_ASCII
) = Qascii
;
1795 Fput (Qascii
, Qcharset
, CHARSET_TABLE_ENTRY (CHARSET_ASCII
));
1797 update_charset_table (make_number (CHARSET_8_BIT_CONTROL
),
1798 make_number (1), make_number (96),
1803 build_string ("8-bit control code (0x80..0x9F)"),
1804 Qnil
, /* same as above */
1805 Qnil
); /* same as above */
1806 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL
) = Qeight_bit_control
;
1807 Fput (Qeight_bit_control
, Qcharset
,
1808 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL
));
1810 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC
),
1811 make_number (1), make_number (96),
1816 build_string ("8-bit graphic char (0xA0..0xFF)"),
1817 Qnil
, /* same as above */
1818 Qnil
); /* same as above */
1819 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC
) = Qeight_bit_graphic
;
1820 Fput (Qeight_bit_graphic
, Qcharset
,
1821 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC
));
1823 Qauto_fill_chars
= intern ("auto-fill-chars");
1824 staticpro (&Qauto_fill_chars
);
1825 Fput (Qauto_fill_chars
, Qchar_table_extra_slots
, make_number (0));
1827 defsubr (&Sdefine_charset
);
1828 defsubr (&Sgeneric_character_list
);
1829 defsubr (&Sget_unused_iso_final_char
);
1830 defsubr (&Sdeclare_equiv_charset
);
1831 defsubr (&Sfind_charset_region
);
1832 defsubr (&Sfind_charset_string
);
1833 defsubr (&Smake_char_internal
);
1834 defsubr (&Ssplit_char
);
1835 defsubr (&Schar_charset
);
1836 defsubr (&Scharset_after
);
1837 defsubr (&Siso_charset
);
1838 defsubr (&Schar_valid_p
);
1839 defsubr (&Sunibyte_char_to_multibyte
);
1840 defsubr (&Smultibyte_char_to_unibyte
);
1841 defsubr (&Schar_bytes
);
1842 defsubr (&Schar_width
);
1843 defsubr (&Sstring_width
);
1844 defsubr (&Schar_direction
);
1845 defsubr (&Schars_in_region
);
1847 defsubr (&Ssetup_special_charsets
);
1849 DEFVAR_LISP ("charset-list", &Vcharset_list
,
1850 "List of charsets ever defined.");
1851 Vcharset_list
= Fcons (Qascii
, Fcons (Qeight_bit_control
,
1852 Fcons (Qeight_bit_graphic
, Qnil
)));
1854 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector
,
1855 "Vector of cons cell of a symbol and translation table ever defined.\n\
1856 An ID of a translation table is an index of this vector.");
1857 Vtranslation_table_vector
= Fmake_vector (make_number (16), Qnil
);
1859 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11
,
1860 "Leading-code of private TYPE9N charset of column-width 1.");
1861 leading_code_private_11
= LEADING_CODE_PRIVATE_11
;
1863 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12
,
1864 "Leading-code of private TYPE9N charset of column-width 2.");
1865 leading_code_private_12
= LEADING_CODE_PRIVATE_12
;
1867 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21
,
1868 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1869 leading_code_private_21
= LEADING_CODE_PRIVATE_21
;
1871 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22
,
1872 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1873 leading_code_private_22
= LEADING_CODE_PRIVATE_22
;
1875 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset
,
1876 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1877 This is used for converting unibyte text to multibyte,\n\
1878 and for inserting character codes specified by number.\n\n\
1879 This serves to convert a Latin-1 or similar 8-bit character code\n\
1880 to the corresponding Emacs multibyte character code.\n\
1881 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1882 for your choice of character set.\n\
1883 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1884 nonascii_insert_offset
= 0;
1886 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table
,
1887 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1888 This is used for converting unibyte text to multibyte,\n\
1889 and for inserting character codes specified by number.\n\n\
1890 Conversion is performed only when multibyte characters are enabled,\n\
1891 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1892 to the corresponding Emacs character code.\n\n\
1893 If this is nil, `nonascii-insert-offset' is used instead.\n\
1894 See also the docstring of `make-translation-table'.");
1895 Vnonascii_translation_table
= Qnil
;
1897 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars
,
1898 "A char-table for characters which invoke auto-filling.\n\
1899 Such characters have value t in this table.");
1900 Vauto_fill_chars
= Fmake_char_table (Qauto_fill_chars
, Qnil
);
1901 CHAR_TABLE_SET (Vauto_fill_chars
, make_number (' '), Qt
);
1902 CHAR_TABLE_SET (Vauto_fill_chars
, make_number ('\n'), Qt
);