1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* At first, see the document in `charset.h' to understand the code in
33 #include <sys/types.h>
37 #include "composite.h"
47 Lisp_Object Qcharset
, Qascii
, Qeight_bit_control
, Qeight_bit_graphic
;
50 /* Declaration of special leading-codes. */
51 int leading_code_private_11
; /* for private DIMENSION1 of 1-column */
52 int leading_code_private_12
; /* for private DIMENSION1 of 2-column */
53 int leading_code_private_21
; /* for private DIMENSION2 of 1-column */
54 int leading_code_private_22
; /* for private DIMENSION2 of 2-column */
56 /* Declaration of special charsets. The values are set by
57 Fsetup_special_charsets. */
58 int charset_latin_iso8859_1
; /* ISO8859-1 (Latin-1) */
59 int charset_jisx0208_1978
; /* JISX0208.1978 (Japanese Kanji old set) */
60 int charset_jisx0208
; /* JISX0208.1983 (Japanese Kanji) */
61 int charset_katakana_jisx0201
; /* JISX0201.Kana (Japanese Katakana) */
62 int charset_latin_jisx0201
; /* JISX0201.Roman (Japanese Roman) */
63 int charset_big5_1
; /* Big5 Level 1 (Chinese Traditional) */
64 int charset_big5_2
; /* Big5 Level 2 (Chinese Traditional) */
66 Lisp_Object Qcharset_table
;
68 /* A char-table containing information of each character set. */
69 Lisp_Object Vcharset_table
;
71 /* A vector of charset symbol indexed by charset-id. This is used
72 only for returning charset symbol from C functions. */
73 Lisp_Object Vcharset_symbol_table
;
75 /* A list of charset symbols ever defined. */
76 Lisp_Object Vcharset_list
;
78 /* Vector of translation table ever defined.
79 ID of a translation table is used to index this vector. */
80 Lisp_Object Vtranslation_table_vector
;
82 /* A char-table for characters which may invoke auto-filling. */
83 Lisp_Object Vauto_fill_chars
;
85 Lisp_Object Qauto_fill_chars
;
87 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
88 int bytes_by_char_head
[256];
89 int width_by_char_head
[256];
91 /* Mapping table from ISO2022's charset (specified by DIMENSION,
92 CHARS, and FINAL-CHAR) to Emacs' charset. */
93 int iso_charset_table
[2][2][128];
95 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
96 unsigned char *_fetch_multibyte_char_p
;
97 int _fetch_multibyte_char_len
;
99 /* Offset to add to a non-ASCII value when inserting it. */
100 int nonascii_insert_offset
;
102 /* Translation table for converting non-ASCII unibyte characters
103 to multibyte codes, or nil. */
104 Lisp_Object Vnonascii_translation_table
;
106 /* List of all possible generic characters. */
107 Lisp_Object Vgeneric_character_list
;
109 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
110 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
113 invalid_character (c
)
116 error ("Invalid character: 0%o, %d, 0x%x", c
, c
, c
);
119 /* Parse string STR of length LENGTH and fetch information of a
120 character at STR. Set BYTES to the byte length the character
121 occupies, CHARSET, C1, C2 to proper values of the character. */
123 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
126 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
128 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
129 else if ((bytes) == 2) \
131 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
132 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
134 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
136 else if ((bytes) == 3) \
138 if ((c1) < LEADING_CODE_PRIVATE_11) \
139 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
141 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
144 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
147 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
148 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
149 ((charset) == CHARSET_ASCII \
150 ? ((c1) >= 0 && (c1) <= 0x7F) \
151 : ((charset) == CHARSET_8_BIT_CONTROL \
152 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
153 : ((charset) == CHARSET_8_BIT_GRAPHIC \
154 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
155 : (CHARSET_DIMENSION (charset) == 1 \
156 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
157 : ((c1) >= 0x20 && (c1) <= 0x7F \
158 && (c2) >= 0x20 && (c2) <= 0x7F)))))
160 /* Store multi-byte form of the character C in STR. The caller should
161 allocate at least 4-byte area at STR in advance. Returns the
162 length of the multi-byte form. If C is an invalid character code,
165 Use macro `CHAR_STRING (C, STR)' instead of calling this function
166 directly if C can be an ASCII character. */
169 char_to_string (c
, str
)
173 unsigned char *p
= str
;
175 if (c
& CHAR_MODIFIER_MASK
) /* This includes the case C is negative. */
177 /* Multibyte character can't have a modifier bit. */
178 if (! SINGLE_BYTE_CHAR_P ((c
& ~CHAR_MODIFIER_MASK
)))
179 invalid_character (c
);
181 /* For Meta, Shift, and Control modifiers, we need special care. */
184 /* Move the meta bit to the right place for a string. */
185 c
= (c
& ~CHAR_META
) | 0x80;
189 /* Shift modifier is valid only with [A-Za-z]. */
190 if ((c
& 0377) >= 'A' && (c
& 0377) <= 'Z')
192 else if ((c
& 0377) >= 'a' && (c
& 0377) <= 'z')
193 c
= (c
& ~CHAR_SHIFT
) - ('a' - 'A');
197 /* Simulate the code in lread.c. */
198 /* Allow `\C- ' and `\C-?'. */
199 if (c
== (CHAR_CTL
| ' '))
201 else if (c
== (CHAR_CTL
| '?'))
203 /* ASCII control chars are made from letters (both cases),
204 as well as the non-letters within 0100...0137. */
205 else if ((c
& 0137) >= 0101 && (c
& 0137) <= 0132)
206 c
&= (037 | (~0177 & ~CHAR_CTL
));
207 else if ((c
& 0177) >= 0100 && (c
& 0177) <= 0137)
208 c
&= (037 | (~0177 & ~CHAR_CTL
));
211 /* If C still has any modifier bits, it is an invalid character. */
212 if (c
& CHAR_MODIFIER_MASK
)
213 invalid_character (c
);
215 if (SINGLE_BYTE_CHAR_P (c
))
217 if (ASCII_BYTE_P (c
) || c
>= 0xA0)
221 *p
++ = LEADING_CODE_8_BIT_CONTROL
;
225 else if (CHAR_VALID_P (c
, 0))
229 SPLIT_CHAR (c
, charset
, c1
, c2
);
231 if (charset
>= LEADING_CODE_EXT_11
)
232 *p
++ = (charset
< LEADING_CODE_EXT_12
233 ? LEADING_CODE_PRIVATE_11
234 : (charset
< LEADING_CODE_EXT_21
235 ? LEADING_CODE_PRIVATE_12
236 : (charset
< LEADING_CODE_EXT_22
237 ? LEADING_CODE_PRIVATE_21
238 : LEADING_CODE_PRIVATE_22
)));
240 if (c1
> 0 && c1
< 32 || c2
> 0 && c2
< 32)
241 invalid_character (c
);
250 invalid_character (c
);
255 /* Return the non-ASCII character corresponding to multi-byte form at
256 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
257 length of the multibyte form in *ACTUAL_LEN.
259 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
260 this function directly if you want ot handle ASCII characters as
264 string_to_char (str
, len
, actual_len
)
265 const unsigned char *str
;
266 int len
, *actual_len
;
268 int c
, bytes
, charset
, c1
, c2
;
270 SPLIT_MULTIBYTE_SEQ (str
, len
, bytes
, charset
, c1
, c2
);
271 c
= MAKE_CHAR (charset
, c1
, c2
);
277 /* Return the length of the multi-byte form at string STR of length LEN.
278 Use the macro MULTIBYTE_FORM_LENGTH instead. */
280 multibyte_form_length (str
, len
)
281 const unsigned char *str
;
286 PARSE_MULTIBYTE_SEQ (str
, len
, bytes
);
290 /* Check multibyte form at string STR of length LEN and set variables
291 pointed by CHARSET, C1, and C2 to charset and position codes of the
292 character at STR, and return 0. If there's no multibyte character,
293 return -1. This should be used only in the macro SPLIT_STRING
294 which checks range of STR in advance. */
297 split_string (str
, len
, charset
, c1
, c2
)
298 const unsigned char *str
;
299 unsigned char *c1
, *c2
;
302 register int bytes
, cs
, code1
, code2
= -1;
304 SPLIT_MULTIBYTE_SEQ (str
, len
, bytes
, cs
, code1
, code2
);
305 if (cs
== CHARSET_ASCII
)
313 /* Return 1 iff character C has valid printable glyph.
314 Use the macro CHAR_PRINTABLE_P instead. */
321 if (ASCII_BYTE_P (c
))
323 else if (SINGLE_BYTE_CHAR_P (c
))
325 else if (c
>= MAX_CHAR
)
328 SPLIT_CHAR (c
, charset
, c1
, c2
);
329 if (! CHARSET_DEFINED_P (charset
))
331 if (CHARSET_CHARS (charset
) == 94
332 ? c1
<= 32 || c1
>= 127
335 if (CHARSET_DIMENSION (charset
) == 2
336 && (CHARSET_CHARS (charset
) == 94
337 ? c2
<= 32 || c2
>= 127
343 /* Translate character C by translation table TABLE. If C
344 is negative, translate a character specified by CHARSET, C1, and C2
345 (C1 and C2 are code points of the character). If no translation is
346 found in TABLE, return C. */
348 translate_char (table
, c
, charset
, c1
, c2
)
350 int c
, charset
, c1
, c2
;
353 int alt_charset
, alt_c1
, alt_c2
, dimension
;
355 if (c
< 0) c
= MAKE_CHAR (charset
, (c1
& 0x7F) , (c2
& 0x7F));
356 if (!CHAR_TABLE_P (table
)
357 || (ch
= Faref (table
, make_number (c
)), !NATNUMP (ch
)))
360 SPLIT_CHAR (XFASTINT (ch
), alt_charset
, alt_c1
, alt_c2
);
361 dimension
= CHARSET_DIMENSION (alt_charset
);
362 if (dimension
== 1 && alt_c1
> 0 || dimension
== 2 && alt_c2
> 0)
363 /* CH is not a generic character, just return it. */
364 return XFASTINT (ch
);
366 /* Since CH is a generic character, we must return a specific
367 charater which has the same position codes as C from CH. */
369 SPLIT_CHAR (c
, charset
, c1
, c2
);
370 if (dimension
!= CHARSET_DIMENSION (charset
))
371 /* We can't make such a character because of dimension mismatch. */
373 return MAKE_CHAR (alt_charset
, c1
, c2
);
376 /* Convert the unibyte character C to multibyte based on
377 Vnonascii_translation_table or nonascii_insert_offset. If they can't
378 convert C to a valid multibyte character, convert it based on
379 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
382 unibyte_char_to_multibyte (c
)
385 if (c
< 0400 && c
>= 0200)
389 if (! NILP (Vnonascii_translation_table
))
391 c
= XINT (Faref (Vnonascii_translation_table
, make_number (c
)));
392 if (c
>= 0400 && ! char_valid_p (c
, 0))
393 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
395 else if (c
>= 0240 && nonascii_insert_offset
> 0)
397 c
+= nonascii_insert_offset
;
398 if (c
< 0400 || ! char_valid_p (c
, 0))
399 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
402 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
408 /* Convert the multibyte character C to unibyte 8-bit character based
409 on Vnonascii_translation_table or nonascii_insert_offset. If
410 REV_TBL is non-nil, it should be a reverse table of
411 Vnonascii_translation_table, i.e. what given by:
412 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
415 multibyte_char_to_unibyte (c
, rev_tbl
)
419 if (!SINGLE_BYTE_CHAR_P (c
))
423 if (! CHAR_TABLE_P (rev_tbl
)
424 && CHAR_TABLE_P (Vnonascii_translation_table
))
425 rev_tbl
= Fchar_table_extra_slot (Vnonascii_translation_table
,
427 if (CHAR_TABLE_P (rev_tbl
))
430 temp
= Faref (rev_tbl
, make_number (c
));
434 c
= (c_save
& 0177) + 0200;
438 if (nonascii_insert_offset
> 0)
439 c
-= nonascii_insert_offset
;
440 if (c
< 128 || c
>= 256)
441 c
= (c_save
& 0177) + 0200;
449 /* Update the table Vcharset_table with the given arguments (see the
450 document of `define-charset' for the meaning of each argument).
451 Several other table contents are also updated. The caller should
452 check the validity of CHARSET-ID and the remaining arguments in
456 update_charset_table (charset_id
, dimension
, chars
, width
, direction
,
457 iso_final_char
, iso_graphic_plane
,
458 short_name
, long_name
, description
)
459 Lisp_Object charset_id
, dimension
, chars
, width
, direction
;
460 Lisp_Object iso_final_char
, iso_graphic_plane
;
461 Lisp_Object short_name
, long_name
, description
;
463 int charset
= XINT (charset_id
);
465 unsigned char leading_code_base
, leading_code_ext
;
467 if (NILP (CHARSET_TABLE_ENTRY (charset
)))
468 CHARSET_TABLE_ENTRY (charset
)
469 = Fmake_vector (make_number (CHARSET_MAX_IDX
), Qnil
);
471 if (NILP (long_name
))
472 long_name
= short_name
;
473 if (NILP (description
))
474 description
= long_name
;
476 /* Get byte length of multibyte form, base leading-code, and
477 extended leading-code of the charset. See the comment under the
478 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
479 bytes
= XINT (dimension
);
480 if (charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
482 /* Official charset, it doesn't have an extended leading-code. */
483 if (charset
!= CHARSET_ASCII
&& charset
!= CHARSET_8_BIT_GRAPHIC
)
484 bytes
+= 1; /* For a base leading-code. */
485 leading_code_base
= charset
;
486 leading_code_ext
= 0;
490 /* Private charset. */
491 bytes
+= 2; /* For base and extended leading-codes. */
493 = (charset
< LEADING_CODE_EXT_12
494 ? LEADING_CODE_PRIVATE_11
495 : (charset
< LEADING_CODE_EXT_21
496 ? LEADING_CODE_PRIVATE_12
497 : (charset
< LEADING_CODE_EXT_22
498 ? LEADING_CODE_PRIVATE_21
499 : LEADING_CODE_PRIVATE_22
)));
500 leading_code_ext
= charset
;
501 if (BYTES_BY_CHAR_HEAD (leading_code_base
) != bytes
)
502 error ("Invalid dimension for the charset-ID %d", charset
);
505 CHARSET_TABLE_INFO (charset
, CHARSET_ID_IDX
) = charset_id
;
506 CHARSET_TABLE_INFO (charset
, CHARSET_BYTES_IDX
) = make_number (bytes
);
507 CHARSET_TABLE_INFO (charset
, CHARSET_DIMENSION_IDX
) = dimension
;
508 CHARSET_TABLE_INFO (charset
, CHARSET_CHARS_IDX
) = chars
;
509 CHARSET_TABLE_INFO (charset
, CHARSET_WIDTH_IDX
) = width
;
510 CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
) = direction
;
511 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_BASE_IDX
)
512 = make_number (leading_code_base
);
513 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_EXT_IDX
)
514 = make_number (leading_code_ext
);
515 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_FINAL_CHAR_IDX
) = iso_final_char
;
516 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_GRAPHIC_PLANE_IDX
)
518 CHARSET_TABLE_INFO (charset
, CHARSET_SHORT_NAME_IDX
) = short_name
;
519 CHARSET_TABLE_INFO (charset
, CHARSET_LONG_NAME_IDX
) = long_name
;
520 CHARSET_TABLE_INFO (charset
, CHARSET_DESCRIPTION_IDX
) = description
;
521 CHARSET_TABLE_INFO (charset
, CHARSET_PLIST_IDX
) = Qnil
;
524 /* If we have already defined a charset which has the same
525 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
526 DIRECTION, we must update the entry REVERSE-CHARSET of both
527 charsets. If there's no such charset, the value of the entry
531 for (i
= 0; i
<= MAX_CHARSET
; i
++)
532 if (!NILP (CHARSET_TABLE_ENTRY (i
)))
534 if (CHARSET_DIMENSION (i
) == XINT (dimension
)
535 && CHARSET_CHARS (i
) == XINT (chars
)
536 && CHARSET_ISO_FINAL_CHAR (i
) == XINT (iso_final_char
)
537 && CHARSET_DIRECTION (i
) != XINT (direction
))
539 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
541 CHARSET_TABLE_INFO (i
, CHARSET_REVERSE_CHARSET_IDX
) = charset_id
;
546 /* No such a charset. */
547 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
551 if (charset
!= CHARSET_ASCII
&& charset
!= CHARSET_8_BIT_GRAPHIC
552 && charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
554 bytes_by_char_head
[leading_code_base
] = bytes
;
555 width_by_char_head
[leading_code_base
] = XINT (width
);
557 /* Update table emacs_code_class. */
558 emacs_code_class
[charset
] = (bytes
== 2
559 ? EMACS_leading_code_2
561 ? EMACS_leading_code_3
562 : EMACS_leading_code_4
));
565 /* Update table iso_charset_table. */
566 if (XINT (iso_final_char
) >= 0
567 && ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) < 0)
568 ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) = charset
;
573 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
576 get_charset_id (charset_symbol
)
577 Lisp_Object charset_symbol
;
582 return ((SYMBOLP (charset_symbol
)
583 && (val
= Fget (charset_symbol
, Qcharset
), VECTORP (val
))
584 && (charset
= XINT (XVECTOR (val
)->contents
[CHARSET_ID_IDX
]),
585 CHARSET_VALID_P (charset
)))
589 /* Return an identification number for a new private charset of
590 DIMENSION and WIDTH. If there's no more room for the new charset,
593 get_new_private_charset_id (dimension
, width
)
594 int dimension
, width
;
596 int charset
, from
, to
;
600 from
= LEADING_CODE_EXT_11
;
601 to
= LEADING_CODE_EXT_21
;
605 from
= LEADING_CODE_EXT_21
;
606 to
= LEADING_CODE_EXT_MAX
+ 1;
609 for (charset
= from
; charset
< to
; charset
++)
610 if (!CHARSET_DEFINED_P (charset
)) break;
612 return make_number (charset
< to
? charset
: 0);
615 DEFUN ("define-charset", Fdefine_charset
, Sdefine_charset
, 3, 3, 0,
616 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
617 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
618 treated as a private charset.\n\
619 INFO-VECTOR is a vector of the format:\n\
620 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
621 SHORT-NAME LONG-NAME DESCRIPTION]\n\
622 The meanings of each elements is as follows:\n\
623 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
624 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
625 WIDTH (integer) is the number of columns a character in the charset\n\
626 occupies on the screen: one of 0, 1, and 2.\n\
628 DIRECTION (integer) is the rendering direction of characters in the\n\
629 charset when rendering. If 0, render from left to right, else\n\
630 render from right to left.\n\
632 ISO-FINAL-CHAR (character) is the final character of the\n\
633 corresponding ISO 2022 charset.\n\
634 It may be -1 if the charset is internal use only.\n\
636 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
637 while encoding to variants of ISO 2022 coding system, one of the\n\
638 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
639 It may be -1 if the charset is internal use only.\n\
641 SHORT-NAME (string) is the short name to refer to the charset.\n\
643 LONG-NAME (string) is the long name to refer to the charset.\n\
645 DESCRIPTION (string) is the description string of the charset.")
646 (charset_id
, charset_symbol
, info_vector
)
647 Lisp_Object charset_id
, charset_symbol
, info_vector
;
651 if (!NILP (charset_id
))
652 CHECK_NUMBER (charset_id
, 0);
653 CHECK_SYMBOL (charset_symbol
, 1);
654 CHECK_VECTOR (info_vector
, 2);
656 if (! NILP (charset_id
))
658 if (! CHARSET_VALID_P (XINT (charset_id
)))
659 error ("Invalid CHARSET: %d", XINT (charset_id
));
660 else if (CHARSET_DEFINED_P (XINT (charset_id
)))
661 error ("Already defined charset: %d", XINT (charset_id
));
664 vec
= XVECTOR (info_vector
)->contents
;
665 if (XVECTOR (info_vector
)->size
!= 9
666 || !INTEGERP (vec
[0]) || !(XINT (vec
[0]) == 1 || XINT (vec
[0]) == 2)
667 || !INTEGERP (vec
[1]) || !(XINT (vec
[1]) == 94 || XINT (vec
[1]) == 96)
668 || !INTEGERP (vec
[2]) || !(XINT (vec
[2]) == 1 || XINT (vec
[2]) == 2)
669 || !INTEGERP (vec
[3]) || !(XINT (vec
[3]) == 0 || XINT (vec
[3]) == 1)
670 || !INTEGERP (vec
[4])
671 || !(XINT (vec
[4]) == -1 || XINT (vec
[4]) >= '0' && XINT (vec
[4]) <= '~')
672 || !INTEGERP (vec
[5])
673 || !(XINT (vec
[5]) == -1 || XINT (vec
[5]) == 0 || XINT (vec
[5]) == 1)
676 || !STRINGP (vec
[8]))
677 error ("Invalid info-vector argument for defining charset %s",
678 XSYMBOL (charset_symbol
)->name
->data
);
680 if (NILP (charset_id
))
682 charset_id
= get_new_private_charset_id (XINT (vec
[0]), XINT (vec
[2]));
683 if (XINT (charset_id
) == 0)
684 error ("There's no room for a new private charset %s",
685 XSYMBOL (charset_symbol
)->name
->data
);
688 update_charset_table (charset_id
, vec
[0], vec
[1], vec
[2], vec
[3],
689 vec
[4], vec
[5], vec
[6], vec
[7], vec
[8]);
690 Fput (charset_symbol
, Qcharset
, CHARSET_TABLE_ENTRY (XINT (charset_id
)));
691 CHARSET_SYMBOL (XINT (charset_id
)) = charset_symbol
;
692 Vcharset_list
= Fcons (charset_symbol
, Vcharset_list
);
696 DEFUN ("generic-character-list", Fgeneric_character_list
,
697 Sgeneric_character_list
, 0, 0, 0,
698 "Return a list of all possible generic characters.\n\
699 It includes a generic character for a charset not yet defined.")
702 return Vgeneric_character_list
;
705 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char
,
706 Sget_unused_iso_final_char
, 2, 2, 0,
707 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
708 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
709 CHARS is the number of characters in a dimension: 94 or 96.\n\
711 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
712 If there's no unused final char for the specified kind of charset,\n\
715 Lisp_Object dimension
, chars
;
719 CHECK_NUMBER (dimension
, 0);
720 CHECK_NUMBER (chars
, 1);
721 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
722 error ("Invalid charset dimension %d, it should be 1 or 2",
724 if (XINT (chars
) != 94 && XINT (chars
) != 96)
725 error ("Invalid charset chars %d, it should be 94 or 96",
727 for (final_char
= '0'; final_char
<= '?'; final_char
++)
729 if (ISO_CHARSET_TABLE (dimension
, chars
, make_number (final_char
)) < 0)
732 return (final_char
<= '?' ? make_number (final_char
) : Qnil
);
735 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset
, Sdeclare_equiv_charset
,
737 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
738 CHARSET should be defined by `defined-charset' in advance.")
739 (dimension
, chars
, final_char
, charset_symbol
)
740 Lisp_Object dimension
, chars
, final_char
, charset_symbol
;
744 CHECK_NUMBER (dimension
, 0);
745 CHECK_NUMBER (chars
, 1);
746 CHECK_NUMBER (final_char
, 2);
747 CHECK_SYMBOL (charset_symbol
, 3);
749 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
750 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension
));
751 if (XINT (chars
) != 94 && XINT (chars
) != 96)
752 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars
));
753 if (XINT (final_char
) < '0' || XFASTINT (final_char
) > '~')
754 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars
));
755 if ((charset
= get_charset_id (charset_symbol
)) < 0)
756 error ("Invalid charset %s", XSYMBOL (charset_symbol
)->name
->data
);
758 ISO_CHARSET_TABLE (dimension
, chars
, final_char
) = charset
;
762 /* Return information about charsets in the text at PTR of NBYTES
763 bytes, which are NCHARS characters. The value is:
765 0: Each character is represented by one byte. This is always
766 true for unibyte text.
767 1: No charsets other than ascii eight-bit-control,
768 eight-bit-graphic, and latin-1 are found.
771 In addition, if CHARSETS is nonzero, for each found charset N, set
772 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
773 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
774 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
775 1 (note that there's no charset whose ID is 1). */
778 find_charset_in_text (ptr
, nchars
, nbytes
, charsets
, table
)
780 int nchars
, nbytes
, *charsets
;
783 if (nchars
== nbytes
)
785 if (charsets
&& nbytes
> 0)
787 unsigned char *endp
= ptr
+ nbytes
;
790 while (ptr
< endp
&& maskbits
!= 7)
792 maskbits
|= (*ptr
< 0x80 ? 1 : *ptr
< 0xA0 ? 2 : 4);
797 charsets
[CHARSET_ASCII
] = 1;
799 charsets
[CHARSET_8_BIT_CONTROL
] = 1;
801 charsets
[CHARSET_8_BIT_GRAPHIC
] = 1;
808 int bytes
, charset
, c1
, c2
;
810 if (! CHAR_TABLE_P (table
))
815 SPLIT_MULTIBYTE_SEQ (ptr
, len
, bytes
, charset
, c1
, c2
);
818 if (!CHARSET_DEFINED_P (charset
))
820 else if (! NILP (table
))
822 int c
= translate_char (table
, -1, charset
, c1
, c2
);
824 charset
= CHAR_CHARSET (c
);
828 && charset
!= CHARSET_ASCII
829 && charset
!= CHARSET_8_BIT_CONTROL
830 && charset
!= CHARSET_8_BIT_GRAPHIC
831 && charset
!= charset_latin_iso8859_1
)
835 charsets
[charset
] = 1;
836 else if (return_val
== 2)
843 DEFUN ("find-charset-region", Ffind_charset_region
, Sfind_charset_region
,
845 "Return a list of charsets in the region between BEG and END.\n\
846 BEG and END are buffer positions.\n\
847 Optional arg TABLE if non-nil is a translation table to look up.\n\
849 If the region contains invalid multibyte characters,\n\
850 `unknown' is included in the returned list.\n\
852 If the current buffer is unibyte, the returned list may contain\n\
853 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
855 Lisp_Object beg
, end
, table
;
857 int charsets
[MAX_CHARSET
+ 1];
858 int from
, from_byte
, to
, stop
, stop_byte
, i
;
861 validate_region (&beg
, &end
);
862 from
= XFASTINT (beg
);
863 stop
= to
= XFASTINT (end
);
865 if (from
< GPT
&& GPT
< to
)
868 stop_byte
= GPT_BYTE
;
871 stop_byte
= CHAR_TO_BYTE (stop
);
873 from_byte
= CHAR_TO_BYTE (from
);
875 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
878 find_charset_in_text (BYTE_POS_ADDR (from_byte
), stop
- from
,
879 stop_byte
- from_byte
, charsets
, table
);
882 from
= stop
, from_byte
= stop_byte
;
883 stop
= to
, stop_byte
= CHAR_TO_BYTE (stop
);
891 val
= Fcons (Qunknown
, val
);
892 for (i
= MAX_CHARSET
; i
>= MIN_CHARSET_OFFICIAL_DIMENSION1
; i
--)
894 val
= Fcons (CHARSET_SYMBOL (i
), val
);
896 val
= Fcons (Qascii
, val
);
900 DEFUN ("find-charset-string", Ffind_charset_string
, Sfind_charset_string
,
902 "Return a list of charsets in STR.\n\
903 Optional arg TABLE if non-nil is a translation table to look up.\n\
905 If the string contains invalid multibyte characters,\n\
906 `unknown' is included in the returned list.\n\
908 If STR is unibyte, the returned list may contain\n\
909 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
911 Lisp_Object str
, table
;
913 int charsets
[MAX_CHARSET
+ 1];
917 CHECK_STRING (str
, 0);
919 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
920 find_charset_in_text (XSTRING (str
)->data
, XSTRING (str
)->size
,
921 STRING_BYTES (XSTRING (str
)), charsets
, table
);
925 val
= Fcons (Qunknown
, val
);
926 for (i
= MAX_CHARSET
; i
>= MIN_CHARSET_OFFICIAL_DIMENSION1
; i
--)
928 val
= Fcons (CHARSET_SYMBOL (i
), val
);
930 val
= Fcons (Qascii
, val
);
935 DEFUN ("make-char-internal", Fmake_char_internal
, Smake_char_internal
, 1, 3, 0,
937 (charset
, code1
, code2
)
938 Lisp_Object charset
, code1
, code2
;
940 int charset_id
, c1
, c2
;
942 CHECK_NUMBER (charset
, 0);
943 charset_id
= XINT (charset
);
944 if (!CHARSET_DEFINED_P (charset_id
))
945 error ("Invalid charset ID: %d", XINT (charset
));
951 CHECK_NUMBER (code1
, 1);
958 CHECK_NUMBER (code2
, 2);
962 if (charset_id
== CHARSET_ASCII
)
964 if (c1
< 0 || c1
> 0x7F)
965 goto invalid_code_posints
;
966 return make_number (c1
);
968 else if (charset_id
== CHARSET_8_BIT_CONTROL
)
972 else if (c1
< 0x80 || c1
> 0x9F)
973 goto invalid_code_posints
;
974 return make_number (c1
);
976 else if (charset_id
== CHARSET_8_BIT_GRAPHIC
)
980 else if (c1
< 0xA0 || c1
> 0xFF)
981 goto invalid_code_posints
;
982 return make_number (c1
);
984 else if (c1
< 0 || c1
> 0xFF || c2
< 0 || c2
> 0xFF)
985 goto invalid_code_posints
;
991 ? !CHAR_COMPONENTS_VALID_P (charset_id
, c1
, 0x20)
992 : !CHAR_COMPONENTS_VALID_P (charset_id
, c1
, c2
)))
993 goto invalid_code_posints
;
994 return make_number (MAKE_CHAR (charset_id
, c1
, c2
));
996 invalid_code_posints
:
997 error ("Invalid code points for charset ID %d: %d %d", charset_id
, c1
, c2
);
1000 DEFUN ("split-char", Fsplit_char
, Ssplit_char
, 1, 1, 0,
1001 "Return list of charset and one or two position-codes of CHAR.\n\
1002 If CHAR is invalid as a character code,\n\
1003 return a list of symbol `unknown' and CHAR.")
1007 int c
, charset
, c1
, c2
;
1009 CHECK_NUMBER (ch
, 0);
1011 if (!CHAR_VALID_P (c
, 1))
1012 return Fcons (Qunknown
, Fcons (ch
, Qnil
));
1013 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
1015 ? Fcons (CHARSET_SYMBOL (charset
),
1016 Fcons (make_number (c1
), Fcons (make_number (c2
), Qnil
)))
1017 : Fcons (CHARSET_SYMBOL (charset
), Fcons (make_number (c1
), Qnil
)));
1020 DEFUN ("char-charset", Fchar_charset
, Schar_charset
, 1, 1, 0,
1021 "Return charset of CHAR.")
1025 CHECK_NUMBER (ch
, 0);
1027 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch
)));
1030 DEFUN ("charset-after", Fcharset_after
, Scharset_after
, 0, 1, 0,
1031 "Return charset of a character in the current buffer at position POS.\n\
1032 If POS is nil, it defauls to the current point.\n\
1033 If POS is out of range, the value is nil.")
1040 ch
= Fchar_after (pos
);
1041 if (! INTEGERP (ch
))
1043 charset
= CHAR_CHARSET (XINT (ch
));
1044 return CHARSET_SYMBOL (charset
);
1047 DEFUN ("iso-charset", Fiso_charset
, Siso_charset
, 3, 3, 0,
1048 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1050 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1051 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1052 where as Emacs distinguishes them by charset symbol.\n\
1053 See the documentation of the function `charset-info' for the meanings of\n\
1054 DIMENSION, CHARS, and FINAL-CHAR.")
1055 (dimension
, chars
, final_char
)
1056 Lisp_Object dimension
, chars
, final_char
;
1060 CHECK_NUMBER (dimension
, 0);
1061 CHECK_NUMBER (chars
, 1);
1062 CHECK_NUMBER (final_char
, 2);
1064 if ((charset
= ISO_CHARSET_TABLE (dimension
, chars
, final_char
)) < 0)
1066 return CHARSET_SYMBOL (charset
);
1069 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1070 generic character. If GENERICP is zero, return nonzero iff C is a
1071 valid normal character. Do not call this function directly,
1072 instead use macro CHAR_VALID_P. */
1074 char_valid_p (c
, genericp
)
1077 int charset
, c1
, c2
;
1079 if (c
< 0 || c
>= MAX_CHAR
)
1081 if (SINGLE_BYTE_CHAR_P (c
))
1083 SPLIT_CHAR (c
, charset
, c1
, c2
);
1088 if (c2
<= 0) c2
= 0x20;
1092 if (c2
<= 0) c1
= c2
= 0x20;
1095 return (CHARSET_DEFINED_P (charset
)
1096 && CHAR_COMPONENTS_VALID_P (charset
, c1
, c2
));
1099 DEFUN ("char-valid-p", Fchar_valid_p
, Schar_valid_p
, 1, 2, 0,
1100 "Return t if OBJECT is a valid normal character.\n\
1101 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1102 a valid generic character.")
1104 Lisp_Object object
, genericp
;
1106 if (! NATNUMP (object
))
1108 return (CHAR_VALID_P (XFASTINT (object
), !NILP (genericp
)) ? Qt
: Qnil
);
1111 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
1112 Sunibyte_char_to_multibyte
, 1, 1, 0,
1113 "Convert the unibyte character CH to multibyte character.\n\
1114 The conversion is done based on `nonascii-translation-table' (which see)\n\
1115 or `nonascii-insert-offset' (which see).")
1121 CHECK_NUMBER (ch
, 0);
1123 if (c
< 0 || c
>= 0400)
1124 error ("Invalid unibyte character: %d", c
);
1125 c
= unibyte_char_to_multibyte (c
);
1127 error ("Can't convert to multibyte character: %d", XINT (ch
));
1128 return make_number (c
);
1131 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte
,
1132 Smultibyte_char_to_unibyte
, 1, 1, 0,
1133 "Convert the multibyte character CH to unibyte character.\n\
1134 The conversion is done based on `nonascii-translation-table' (which see)\n\
1135 or `nonascii-insert-offset' (which see).")
1141 CHECK_NUMBER (ch
, 0);
1143 if (! CHAR_VALID_P (c
, 0))
1144 error ("Invalid multibyte character: %d", c
);
1145 c
= multibyte_char_to_unibyte (c
, Qnil
);
1147 error ("Can't convert to unibyte character: %d", XINT (ch
));
1148 return make_number (c
);
1151 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
1152 "Return 1 regardless of the argument CHAR.\n\
1153 This is now an obsolete function. We keep it just for backward compatibility.")
1157 CHECK_NUMBER (ch
, 0);
1158 return make_number (1);
1161 /* Return how many bytes C will occupy in a multibyte buffer.
1162 Don't call this function directly, instead use macro CHAR_BYTES. */
1169 if (ASCII_BYTE_P (c
) || (c
& ~((1 << CHARACTERBITS
) -1)))
1171 if (SINGLE_BYTE_CHAR_P (c
) && c
>= 0xA0)
1174 charset
= CHAR_CHARSET (c
);
1175 return (CHARSET_DEFINED_P (charset
) ? CHARSET_BYTES (charset
) : 1);
1178 /* Return the width of character of which multi-byte form starts with
1179 C. The width is measured by how many columns occupied on the
1180 screen when displayed in the current buffer. */
1182 #define ONE_BYTE_CHAR_WIDTH(c) \
1185 ? XFASTINT (current_buffer->tab_width) \
1186 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1190 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1191 : ((! NILP (current_buffer->enable_multibyte_characters) \
1192 && BASE_LEADING_CODE_P (c)) \
1193 ? WIDTH_BY_CHAR_HEAD (c) \
1196 DEFUN ("char-width", Fchar_width
, Schar_width
, 1, 1, 0,
1197 "Return width of CHAR when displayed in the current buffer.\n\
1198 The width is measured by how many columns it occupies on the screen.\n\
1199 Tab is taken to occupy `tab-width' columns.")
1203 Lisp_Object val
, disp
;
1205 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1207 CHECK_NUMBER (ch
, 0);
1211 /* Get the way the display table would display it. */
1212 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
1215 XSETINT (val
, XVECTOR (disp
)->size
);
1216 else if (SINGLE_BYTE_CHAR_P (c
))
1217 XSETINT (val
, ONE_BYTE_CHAR_WIDTH (c
));
1220 int charset
= CHAR_CHARSET (c
);
1222 XSETFASTINT (val
, CHARSET_WIDTH (charset
));
1227 /* Return width of string STR of length LEN when displayed in the
1228 current buffer. The width is measured by how many columns it
1229 occupies on the screen. */
1236 return c_string_width (str
, len
, -1, NULL
, NULL
);
1239 /* Return width of string STR of length LEN when displayed in the
1240 current buffer. The width is measured by how many columns it
1241 occupies on the screen. If PRECISION > 0, return the width of
1242 longest substring that doesn't exceed PRECISION, and set number of
1243 characters and bytes of the substring in *NCHARS and *NBYTES
1247 c_string_width (str
, len
, precision
, nchars
, nbytes
)
1249 int precision
, *nchars
, *nbytes
;
1251 int i
= 0, i_byte
= 0;
1254 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1256 while (i_byte
< len
)
1258 int bytes
, thiswidth
;
1263 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
1266 val
= DISP_CHAR_VECTOR (dp
, c
);
1268 thiswidth
= XVECTOR (val
)->size
;
1270 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1275 PARSE_MULTIBYTE_SEQ (str
+ i_byte
, len
- i_byte
, bytes
);
1276 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1280 && (width
+ thiswidth
> precision
))
1300 /* Return width of Lisp string STRING when displayed in the current
1301 buffer. The width is measured by how many columns it occupies on
1302 the screen while paying attention to compositions. If PRECISION >
1303 0, return the width of longest substring that doesn't exceed
1304 PRECISION, and set number of characters and bytes of the substring
1305 in *NCHARS and *NBYTES respectively. */
1308 lisp_string_width (string
, precision
, nchars
, nbytes
)
1310 int precision
, *nchars
, *nbytes
;
1312 int len
= XSTRING (string
)->size
;
1313 int len_byte
= STRING_BYTES (XSTRING (string
));
1314 unsigned char *str
= XSTRING (string
)->data
;
1315 int i
= 0, i_byte
= 0;
1317 struct Lisp_Char_Table
*dp
= buffer_display_table ();
1321 int chars
, bytes
, thiswidth
;
1326 if (find_composition (i
, -1, &ignore
, &end
, &val
, string
)
1327 && ((cmp_id
= get_composition_id (i
, i_byte
, end
- i
, val
, string
))
1330 thiswidth
= composition_table
[cmp_id
]->width
;
1332 bytes
= string_char_to_byte (string
, end
) - i_byte
;
1336 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
1339 val
= DISP_CHAR_VECTOR (dp
, c
);
1341 thiswidth
= XVECTOR (val
)->size
;
1343 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1348 PARSE_MULTIBYTE_SEQ (str
+ i_byte
, len_byte
- i_byte
, bytes
);
1349 thiswidth
= ONE_BYTE_CHAR_WIDTH (str
[i_byte
]);
1353 && (width
+ thiswidth
> precision
))
1373 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
1374 "Return width of STRING when displayed in the current buffer.\n\
1375 Width is measured by how many columns it occupies on the screen.\n\
1376 When calculating width of a multibyte character in STRING,\n\
1377 only the base leading-code is considered; the validity of\n\
1378 the following bytes is not checked. Tabs in STRING are always\n\
1379 taken to occupy `tab-width' columns.")
1385 CHECK_STRING (str
, 0);
1386 XSETFASTINT (val
, lisp_string_width (str
, -1, NULL
, NULL
));
1390 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
1391 "Return the direction of CHAR.\n\
1392 The returned value is 0 for left-to-right and 1 for right-to-left.")
1398 CHECK_NUMBER (ch
, 0);
1399 charset
= CHAR_CHARSET (XFASTINT (ch
));
1400 if (!CHARSET_DEFINED_P (charset
))
1401 invalid_character (XINT (ch
));
1402 return CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
);
1405 DEFUN ("chars-in-region", Fchars_in_region
, Schars_in_region
, 2, 2, 0,
1406 "Return number of characters between BEG and END.")
1408 Lisp_Object beg
, end
;
1412 CHECK_NUMBER_COERCE_MARKER (beg
, 0);
1413 CHECK_NUMBER_COERCE_MARKER (end
, 1);
1415 from
= min (XFASTINT (beg
), XFASTINT (end
));
1416 to
= max (XFASTINT (beg
), XFASTINT (end
));
1418 return make_number (to
- from
);
1421 /* Return the number of characters in the NBYTES bytes at PTR.
1422 This works by looking at the contents and checking for multibyte sequences.
1423 However, if the current buffer has enable-multibyte-characters = nil,
1424 we treat each byte as a character. */
1427 chars_in_text (ptr
, nbytes
)
1431 /* current_buffer is null at early stages of Emacs initialization. */
1432 if (current_buffer
== 0
1433 || NILP (current_buffer
->enable_multibyte_characters
))
1436 return multibyte_chars_in_text (ptr
, nbytes
);
1439 /* Return the number of characters in the NBYTES bytes at PTR.
1440 This works by looking at the contents and checking for multibyte sequences.
1441 It ignores enable-multibyte-characters. */
1444 multibyte_chars_in_text (ptr
, nbytes
)
1448 unsigned char *endp
;
1451 endp
= ptr
+ nbytes
;
1456 PARSE_MULTIBYTE_SEQ (ptr
, endp
- ptr
, bytes
);
1464 /* Parse unibyte text at STR of LEN bytes as multibyte text, and
1465 count the numbers of characters and bytes in it. On counting
1466 bytes, pay attention to the fact that 8-bit characters in the range
1467 0x80..0x9F are represented by 2 bytes in multibyte text. */
1469 parse_str_as_multibyte (str
, len
, nchars
, nbytes
)
1471 int len
, *nchars
, *nbytes
;
1473 unsigned char *endp
= str
+ len
;
1474 int n
, chars
= 0, bytes
= 0;
1478 if (UNIBYTE_STR_AS_MULTIBYTE_P (str
, endp
- str
, n
))
1479 str
+= n
, bytes
+= n
;
1489 /* Arrange unibyte text at STR of NBYTES bytes as multibyte text.
1490 It actually converts only 8-bit characters in the range 0x80..0x9F
1491 that don't contruct multibyte characters to multibyte forms. If
1492 NCHARS is nonzero, set *NCHARS to the number of characters in the
1493 text. It is assured that we can use LEN bytes at STR as a work
1494 area and that is enough. Return the number of bytes of the
1498 str_as_multibyte (str
, len
, nbytes
, nchars
)
1500 int len
, nbytes
, *nchars
;
1502 unsigned char *p
= str
, *endp
= str
+ nbytes
;
1507 while (p
< endp
&& UNIBYTE_STR_AS_MULTIBYTE_P (p
, endp
- p
, n
))
1517 safe_bcopy (p
, endp
- nbytes
, nbytes
);
1521 if (UNIBYTE_STR_AS_MULTIBYTE_P (p
, endp
- p
, n
))
1528 *to
++ = LEADING_CODE_8_BIT_CONTROL
;
1529 *to
++ = *p
++ + 0x20;
1538 /* Parse unibyte string at STR of LEN bytes, and return the number of
1539 bytes it may ocupy when converted to multibyte string by
1540 `str_to_multibyte'. */
1543 parse_str_to_multibyte (str
, len
)
1547 unsigned char *endp
= str
+ len
;
1550 for (bytes
= 0; str
< endp
; str
++)
1551 bytes
+= (*str
< 0x80 || *str
>= 0xA0) ? 1 : 2;
1555 /* Convert unibyte text at STR of NBYTES bytes to multibyte text
1556 that contains the same single-byte characters. It actually
1557 converts all 8-bit characters to multibyte forms. It is assured
1558 that we can use LEN bytes at STR as a work area and that is
1562 str_to_multibyte (str
, len
, bytes
)
1566 unsigned char *p
= str
, *endp
= str
+ bytes
;
1569 while (p
< endp
&& (*p
< 0x80 || *p
>= 0xA0)) p
++;
1575 safe_bcopy (p
, endp
- bytes
, bytes
);
1579 if (*p
< 0x80 || *p
>= 0xA0)
1582 *to
++ = LEADING_CODE_8_BIT_CONTROL
, *to
++ = *p
++ + 0x20;
1587 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1588 actually converts only 8-bit characters in the range 0x80..0x9F to
1592 str_as_unibyte (str
, bytes
)
1596 unsigned char *p
= str
, *endp
= str
+ bytes
;
1597 unsigned char *to
= str
;
1599 while (p
< endp
&& *p
!= LEADING_CODE_8_BIT_CONTROL
) p
++;
1603 if (*p
== LEADING_CODE_8_BIT_CONTROL
)
1604 *to
++ = *(p
+ 1) - 0x20, p
+= 2;
1612 DEFUN ("string", Fstring
, Sstring
, 1, MANY
, 0,
1613 "Concatenate all the argument characters and make the result a string.")
1619 unsigned char *buf
= (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH
* n
);
1620 unsigned char *p
= buf
;
1624 for (i
= 0; i
< n
; i
++)
1626 CHECK_NUMBER (args
[i
], 0);
1627 if (!multibyte
&& !SINGLE_BYTE_CHAR_P (XFASTINT (args
[i
])))
1631 for (i
= 0; i
< n
; i
++)
1635 p
+= CHAR_STRING (c
, p
);
1640 return make_string_from_bytes (buf
, n
, p
- buf
);
1646 charset_id_internal (charset_name
)
1651 val
= Fget (intern (charset_name
), Qcharset
);
1653 error ("Charset %s is not defined", charset_name
);
1655 return (XINT (XVECTOR (val
)->contents
[0]));
1658 DEFUN ("setup-special-charsets", Fsetup_special_charsets
,
1659 Ssetup_special_charsets
, 0, 0, 0, "Internal use only.")
1662 charset_latin_iso8859_1
= charset_id_internal ("latin-iso8859-1");
1663 charset_jisx0208_1978
= charset_id_internal ("japanese-jisx0208-1978");
1664 charset_jisx0208
= charset_id_internal ("japanese-jisx0208");
1665 charset_katakana_jisx0201
= charset_id_internal ("katakana-jisx0201");
1666 charset_latin_jisx0201
= charset_id_internal ("latin-jisx0201");
1667 charset_big5_1
= charset_id_internal ("chinese-big5-1");
1668 charset_big5_2
= charset_id_internal ("chinese-big5-2");
1673 init_charset_once ()
1677 staticpro (&Vcharset_table
);
1678 staticpro (&Vcharset_symbol_table
);
1679 staticpro (&Vgeneric_character_list
);
1681 /* This has to be done here, before we call Fmake_char_table. */
1682 Qcharset_table
= intern ("charset-table");
1683 staticpro (&Qcharset_table
);
1685 /* Intern this now in case it isn't already done.
1686 Setting this variable twice is harmless.
1687 But don't staticpro it here--that is done in alloc.c. */
1688 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
1690 /* Now we are ready to set up this property, so we can
1691 create the charset table. */
1692 Fput (Qcharset_table
, Qchar_table_extra_slots
, make_number (0));
1693 Vcharset_table
= Fmake_char_table (Qcharset_table
, Qnil
);
1695 Qunknown
= intern ("unknown");
1696 staticpro (&Qunknown
);
1697 Vcharset_symbol_table
= Fmake_vector (make_number (MAX_CHARSET
+ 1),
1701 for (i
= 0; i
< 2; i
++)
1702 for (j
= 0; j
< 2; j
++)
1703 for (k
= 0; k
< 128; k
++)
1704 iso_charset_table
[i
][j
][k
] = -1;
1706 for (i
= 0; i
< 256; i
++)
1707 bytes_by_char_head
[i
] = 1;
1708 bytes_by_char_head
[LEADING_CODE_PRIVATE_11
] = 3;
1709 bytes_by_char_head
[LEADING_CODE_PRIVATE_12
] = 3;
1710 bytes_by_char_head
[LEADING_CODE_PRIVATE_21
] = 4;
1711 bytes_by_char_head
[LEADING_CODE_PRIVATE_22
] = 4;
1713 for (i
= 0; i
< 128; i
++)
1714 width_by_char_head
[i
] = 1;
1715 for (; i
< 256; i
++)
1716 width_by_char_head
[i
] = 4;
1717 width_by_char_head
[LEADING_CODE_PRIVATE_11
] = 1;
1718 width_by_char_head
[LEADING_CODE_PRIVATE_12
] = 2;
1719 width_by_char_head
[LEADING_CODE_PRIVATE_21
] = 1;
1720 width_by_char_head
[LEADING_CODE_PRIVATE_22
] = 2;
1726 for (i
= 0x81; i
< 0x90; i
++)
1727 val
= Fcons (make_number ((i
- 0x70) << 7), val
);
1728 for (; i
< 0x9A; i
++)
1729 val
= Fcons (make_number ((i
- 0x8F) << 14), val
);
1730 for (i
= 0xA0; i
< 0xF0; i
++)
1731 val
= Fcons (make_number ((i
- 0x70) << 7), val
);
1732 for (; i
< 0xFF; i
++)
1733 val
= Fcons (make_number ((i
- 0xE0) << 14), val
);
1734 Vgeneric_character_list
= Fnreverse (val
);
1737 nonascii_insert_offset
= 0;
1738 Vnonascii_translation_table
= Qnil
;
1746 Qcharset
= intern ("charset");
1747 staticpro (&Qcharset
);
1749 Qascii
= intern ("ascii");
1750 staticpro (&Qascii
);
1752 Qeight_bit_control
= intern ("eight-bit-control");
1753 staticpro (&Qeight_bit_control
);
1755 Qeight_bit_graphic
= intern ("eight-bit-graphic");
1756 staticpro (&Qeight_bit_graphic
);
1758 /* Define special charsets ascii, eight-bit-control, and
1759 eight-bit-graphic. */
1760 update_charset_table (make_number (CHARSET_ASCII
),
1761 make_number (1), make_number (94),
1766 build_string ("ASCII"),
1767 Qnil
, /* same as above */
1768 build_string ("ASCII (ISO646 IRV)"));
1769 CHARSET_SYMBOL (CHARSET_ASCII
) = Qascii
;
1770 Fput (Qascii
, Qcharset
, CHARSET_TABLE_ENTRY (CHARSET_ASCII
));
1772 update_charset_table (make_number (CHARSET_8_BIT_CONTROL
),
1773 make_number (1), make_number (96),
1778 build_string ("8-bit control code (0x80..0x9F)"),
1779 Qnil
, /* same as above */
1780 Qnil
); /* same as above */
1781 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL
) = Qeight_bit_control
;
1782 Fput (Qeight_bit_control
, Qcharset
,
1783 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL
));
1785 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC
),
1786 make_number (1), make_number (96),
1791 build_string ("8-bit graphic char (0xA0..0xFF)"),
1792 Qnil
, /* same as above */
1793 Qnil
); /* same as above */
1794 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC
) = Qeight_bit_graphic
;
1795 Fput (Qeight_bit_graphic
, Qcharset
,
1796 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC
));
1798 Qauto_fill_chars
= intern ("auto-fill-chars");
1799 staticpro (&Qauto_fill_chars
);
1800 Fput (Qauto_fill_chars
, Qchar_table_extra_slots
, make_number (0));
1802 defsubr (&Sdefine_charset
);
1803 defsubr (&Sgeneric_character_list
);
1804 defsubr (&Sget_unused_iso_final_char
);
1805 defsubr (&Sdeclare_equiv_charset
);
1806 defsubr (&Sfind_charset_region
);
1807 defsubr (&Sfind_charset_string
);
1808 defsubr (&Smake_char_internal
);
1809 defsubr (&Ssplit_char
);
1810 defsubr (&Schar_charset
);
1811 defsubr (&Scharset_after
);
1812 defsubr (&Siso_charset
);
1813 defsubr (&Schar_valid_p
);
1814 defsubr (&Sunibyte_char_to_multibyte
);
1815 defsubr (&Smultibyte_char_to_unibyte
);
1816 defsubr (&Schar_bytes
);
1817 defsubr (&Schar_width
);
1818 defsubr (&Sstring_width
);
1819 defsubr (&Schar_direction
);
1820 defsubr (&Schars_in_region
);
1822 defsubr (&Ssetup_special_charsets
);
1824 DEFVAR_LISP ("charset-list", &Vcharset_list
,
1825 "List of charsets ever defined.");
1826 Vcharset_list
= Fcons (Qascii
, Fcons (Qeight_bit_control
,
1827 Fcons (Qeight_bit_graphic
, Qnil
)));
1829 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector
,
1830 "Vector of cons cell of a symbol and translation table ever defined.\n\
1831 An ID of a translation table is an index of this vector.");
1832 Vtranslation_table_vector
= Fmake_vector (make_number (16), Qnil
);
1834 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11
,
1835 "Leading-code of private TYPE9N charset of column-width 1.");
1836 leading_code_private_11
= LEADING_CODE_PRIVATE_11
;
1838 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12
,
1839 "Leading-code of private TYPE9N charset of column-width 2.");
1840 leading_code_private_12
= LEADING_CODE_PRIVATE_12
;
1842 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21
,
1843 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1844 leading_code_private_21
= LEADING_CODE_PRIVATE_21
;
1846 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22
,
1847 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1848 leading_code_private_22
= LEADING_CODE_PRIVATE_22
;
1850 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset
,
1851 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1852 This is used for converting unibyte text to multibyte,\n\
1853 and for inserting character codes specified by number.\n\n\
1854 This serves to convert a Latin-1 or similar 8-bit character code\n\
1855 to the corresponding Emacs multibyte character code.\n\
1856 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1857 for your choice of character set.\n\
1858 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1859 nonascii_insert_offset
= 0;
1861 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table
,
1862 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1863 This is used for converting unibyte text to multibyte,\n\
1864 and for inserting character codes specified by number.\n\n\
1865 Conversion is performed only when multibyte characters are enabled,\n\
1866 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1867 to the corresponding Emacs character code.\n\n\
1868 If this is nil, `nonascii-insert-offset' is used instead.\n\
1869 See also the docstring of `make-translation-table'.");
1870 Vnonascii_translation_table
= Qnil
;
1872 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars
,
1873 "A char-table for characters which invoke auto-filling.\n\
1874 Such characters have value t in this table.");
1875 Vauto_fill_chars
= Fmake_char_table (Qauto_fill_chars
, Qnil
);
1876 CHAR_TABLE_SET (Vauto_fill_chars
, make_number (' '), Qt
);
1877 CHAR_TABLE_SET (Vauto_fill_chars
, make_number ('\n'), Qt
);