(window_scroll_pixel_based): Yet another int/Lisp_Object mixup (YAILOM).
[emacs.git] / src / charset.c
blob211de24ef8972a813fc4054d86510cca901a5e92
1 /* Basic multilingual character support.
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1997, 1998, 1999, 2000, 2001
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
8 This file is part of GNU Emacs.
10 GNU Emacs is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
15 GNU Emacs is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with GNU Emacs; see the file COPYING. If not, write to
22 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 Boston, MA 02110-1301, USA. */
25 /* At first, see the document in `charset.h' to understand the code in
26 this file. */
28 #ifdef emacs
29 #include <config.h>
30 #endif
32 #include <stdio.h>
34 #ifdef emacs
36 #include <sys/types.h>
37 #include "lisp.h"
38 #include "buffer.h"
39 #include "charset.h"
40 #include "composite.h"
41 #include "coding.h"
42 #include "disptab.h"
44 #else /* not emacs */
46 #include "mulelib.h"
48 #endif /* emacs */
50 Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
51 Lisp_Object Qunknown;
53 /* Declaration of special leading-codes. */
54 EMACS_INT leading_code_private_11; /* for private DIMENSION1 of 1-column */
55 EMACS_INT leading_code_private_12; /* for private DIMENSION1 of 2-column */
56 EMACS_INT leading_code_private_21; /* for private DIMENSION2 of 1-column */
57 EMACS_INT leading_code_private_22; /* for private DIMENSION2 of 2-column */
59 /* Declaration of special charsets. The values are set by
60 Fsetup_special_charsets. */
61 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
62 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
63 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
64 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
65 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
66 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
67 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
68 int charset_mule_unicode_0100_24ff;
69 int charset_mule_unicode_2500_33ff;
70 int charset_mule_unicode_e000_ffff;
72 Lisp_Object Qcharset_table;
74 /* A char-table containing information of each character set. */
75 Lisp_Object Vcharset_table;
77 /* A vector of charset symbol indexed by charset-id. This is used
78 only for returning charset symbol from C functions. */
79 Lisp_Object Vcharset_symbol_table;
81 /* A list of charset symbols ever defined. */
82 Lisp_Object Vcharset_list;
84 /* Vector of translation table ever defined.
85 ID of a translation table is used to index this vector. */
86 Lisp_Object Vtranslation_table_vector;
88 /* A char-table for characters which may invoke auto-filling. */
89 Lisp_Object Vauto_fill_chars;
91 Lisp_Object Qauto_fill_chars;
93 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
94 int bytes_by_char_head[256];
95 int width_by_char_head[256];
97 /* Mapping table from ISO2022's charset (specified by DIMENSION,
98 CHARS, and FINAL-CHAR) to Emacs' charset. */
99 int iso_charset_table[2][2][128];
101 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
102 unsigned char *_fetch_multibyte_char_p;
103 int _fetch_multibyte_char_len;
105 /* Offset to add to a non-ASCII value when inserting it. */
106 EMACS_INT nonascii_insert_offset;
108 /* Translation table for converting non-ASCII unibyte characters
109 to multibyte codes, or nil. */
110 Lisp_Object Vnonascii_translation_table;
112 /* List of all possible generic characters. */
113 Lisp_Object Vgeneric_character_list;
116 void
117 invalid_character (c)
118 int c;
120 error ("Invalid character: %d, #o%o, #x%x", c, c, c);
123 /* Parse string STR of length LENGTH and fetch information of a
124 character at STR. Set BYTES to the byte length the character
125 occupies, CHARSET, C1, C2 to proper values of the character. */
127 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
128 do { \
129 (c1) = *(str); \
130 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
131 if ((bytes) == 1) \
132 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
133 else if ((bytes) == 2) \
135 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
136 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
137 else \
138 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
140 else if ((bytes) == 3) \
142 if ((c1) < LEADING_CODE_PRIVATE_11) \
143 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
144 else \
145 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
147 else \
148 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
149 } while (0)
151 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.
152 Note that this intentionally allows invalid components, such
153 as 0xA0 0xA0, because there exist many files that contain
154 such invalid byte sequences, especially in EUC-GB. */
155 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
156 ((charset) == CHARSET_ASCII \
157 ? ((c1) >= 0 && (c1) <= 0x7F) \
158 : ((charset) == CHARSET_8_BIT_CONTROL \
159 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
160 : ((charset) == CHARSET_8_BIT_GRAPHIC \
161 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
162 : (CHARSET_DIMENSION (charset) == 1 \
163 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
164 : ((c1) >= 0x20 && (c1) <= 0x7F \
165 && (c2) >= 0x20 && (c2) <= 0x7F)))))
167 /* Store multi-byte form of the character C in STR. The caller should
168 allocate at least 4-byte area at STR in advance. Returns the
169 length of the multi-byte form. If C is an invalid character code,
170 return -1. */
173 char_to_string_1 (c, str)
174 int c;
175 unsigned char *str;
177 unsigned char *p = str;
179 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
181 /* Multibyte character can't have a modifier bit. */
182 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
183 return -1;
185 /* For Meta, Shift, and Control modifiers, we need special care. */
186 if (c & CHAR_META)
188 /* Move the meta bit to the right place for a string. */
189 c = (c & ~CHAR_META) | 0x80;
191 if (c & CHAR_SHIFT)
193 /* Shift modifier is valid only with [A-Za-z]. */
194 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
195 c &= ~CHAR_SHIFT;
196 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
197 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
199 if (c & CHAR_CTL)
201 /* Simulate the code in lread.c. */
202 /* Allow `\C- ' and `\C-?'. */
203 if (c == (CHAR_CTL | ' '))
204 c = 0;
205 else if (c == (CHAR_CTL | '?'))
206 c = 127;
207 /* ASCII control chars are made from letters (both cases),
208 as well as the non-letters within 0100...0137. */
209 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
210 c &= (037 | (~0177 & ~CHAR_CTL));
211 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
212 c &= (037 | (~0177 & ~CHAR_CTL));
215 /* If C still has any modifier bits, just ignore it. */
216 c &= ~CHAR_MODIFIER_MASK;
219 if (SINGLE_BYTE_CHAR_P (c))
221 if (ASCII_BYTE_P (c) || c >= 0xA0)
222 *p++ = c;
223 else
225 *p++ = LEADING_CODE_8_BIT_CONTROL;
226 *p++ = c + 0x20;
229 else if (CHAR_VALID_P (c, 0))
231 int charset, c1, c2;
233 SPLIT_CHAR (c, charset, c1, c2);
235 if (charset >= LEADING_CODE_EXT_11)
236 *p++ = (charset < LEADING_CODE_EXT_12
237 ? LEADING_CODE_PRIVATE_11
238 : (charset < LEADING_CODE_EXT_21
239 ? LEADING_CODE_PRIVATE_12
240 : (charset < LEADING_CODE_EXT_22
241 ? LEADING_CODE_PRIVATE_21
242 : LEADING_CODE_PRIVATE_22)));
243 *p++ = charset;
244 if ((c1 > 0 && c1 < 32) || (c2 > 0 && c2 < 32))
245 return -1;
246 if (c1)
248 *p++ = c1 | 0x80;
249 if (c2 > 0)
250 *p++ = c2 | 0x80;
253 else
254 return -1;
256 return (p - str);
260 /* Store multi-byte form of the character C in STR. The caller should
261 allocate at least 4-byte area at STR in advance. Returns the
262 length of the multi-byte form. If C is an invalid character code,
263 signal an error.
265 Use macro `CHAR_STRING (C, STR)' instead of calling this function
266 directly if C can be an ASCII character. */
269 char_to_string (c, str)
270 int c;
271 unsigned char *str;
273 int len;
274 len = char_to_string_1 (c, str);
275 if (len == -1)
276 invalid_character (c);
277 return len;
281 /* Return the non-ASCII character corresponding to multi-byte form at
282 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
283 length of the multibyte form in *ACTUAL_LEN.
285 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
286 this function directly if you want ot handle ASCII characters as
287 well. */
290 string_to_char (str, len, actual_len)
291 const unsigned char *str;
292 int len, *actual_len;
294 int c, bytes, charset, c1, c2;
296 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
297 c = MAKE_CHAR (charset, c1, c2);
298 if (actual_len)
299 *actual_len = bytes;
300 return c;
303 /* Return the length of the multi-byte form at string STR of length LEN.
304 Use the macro MULTIBYTE_FORM_LENGTH instead. */
306 multibyte_form_length (str, len)
307 const unsigned char *str;
308 int len;
310 int bytes;
312 PARSE_MULTIBYTE_SEQ (str, len, bytes);
313 return bytes;
316 /* Check multibyte form at string STR of length LEN and set variables
317 pointed by CHARSET, C1, and C2 to charset and position codes of the
318 character at STR, and return 0. If there's no multibyte character,
319 return -1. This should be used only in the macro SPLIT_STRING
320 which checks range of STR in advance. */
323 split_string (str, len, charset, c1, c2)
324 const unsigned char *str;
325 unsigned char *c1, *c2;
326 int len, *charset;
328 register int bytes, cs, code1, code2 = -1;
330 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
331 if (cs == CHARSET_ASCII)
332 return -1;
333 *charset = cs;
334 *c1 = code1;
335 *c2 = code2;
336 return 0;
339 /* Return 1 iff character C has valid printable glyph.
340 Use the macro CHAR_PRINTABLE_P instead. */
342 char_printable_p (c)
343 int c;
345 int charset, c1, c2;
347 if (ASCII_BYTE_P (c))
348 return 1;
349 else if (SINGLE_BYTE_CHAR_P (c))
350 return 0;
351 else if (c >= MAX_CHAR)
352 return 0;
354 SPLIT_CHAR (c, charset, c1, c2);
355 if (! CHARSET_DEFINED_P (charset))
356 return 0;
357 if (CHARSET_CHARS (charset) == 94
358 ? c1 <= 32 || c1 >= 127
359 : c1 < 32)
360 return 0;
361 if (CHARSET_DIMENSION (charset) == 2
362 && (CHARSET_CHARS (charset) == 94
363 ? c2 <= 32 || c2 >= 127
364 : c2 < 32))
365 return 0;
366 return 1;
369 /* Translate character C by translation table TABLE. If C
370 is negative, translate a character specified by CHARSET, C1, and C2
371 (C1 and C2 are code points of the character). If no translation is
372 found in TABLE, return C. */
374 translate_char (table, c, charset, c1, c2)
375 Lisp_Object table;
376 int c, charset, c1, c2;
378 Lisp_Object ch;
379 int alt_charset, alt_c1, alt_c2, dimension;
381 if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
382 if (!CHAR_TABLE_P (table)
383 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
384 return c;
386 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
387 dimension = CHARSET_DIMENSION (alt_charset);
388 if ((dimension == 1 && alt_c1 > 0) || (dimension == 2 && alt_c2 > 0))
389 /* CH is not a generic character, just return it. */
390 return XFASTINT (ch);
392 /* Since CH is a generic character, we must return a specific
393 charater which has the same position codes as C from CH. */
394 if (charset < 0)
395 SPLIT_CHAR (c, charset, c1, c2);
396 if (dimension != CHARSET_DIMENSION (charset))
397 /* We can't make such a character because of dimension mismatch. */
398 return c;
399 return MAKE_CHAR (alt_charset, c1, c2);
402 /* Convert the unibyte character C to multibyte based on
403 Vnonascii_translation_table or nonascii_insert_offset. If they can't
404 convert C to a valid multibyte character, convert it based on
405 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
408 unibyte_char_to_multibyte (c)
409 int c;
411 if (c < 0400 && c >= 0200)
413 int c_save = c;
415 if (! NILP (Vnonascii_translation_table))
417 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
418 if (c >= 0400 && ! char_valid_p (c, 0))
419 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
421 else if (c >= 0240 && nonascii_insert_offset > 0)
423 c += nonascii_insert_offset;
424 if (c < 0400 || ! char_valid_p (c, 0))
425 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
427 else if (c >= 0240)
428 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
430 return c;
434 /* Convert the multibyte character C to unibyte 8-bit character based
435 on Vnonascii_translation_table or nonascii_insert_offset. If
436 REV_TBL is non-nil, it should be a reverse table of
437 Vnonascii_translation_table, i.e. what given by:
438 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
441 multibyte_char_to_unibyte (c, rev_tbl)
442 int c;
443 Lisp_Object rev_tbl;
445 if (!SINGLE_BYTE_CHAR_P (c))
447 int c_save = c;
449 if (! CHAR_TABLE_P (rev_tbl)
450 && CHAR_TABLE_P (Vnonascii_translation_table))
451 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
452 make_number (0));
453 if (CHAR_TABLE_P (rev_tbl))
455 Lisp_Object temp;
456 temp = Faref (rev_tbl, make_number (c));
457 if (INTEGERP (temp))
458 c = XINT (temp);
459 if (c >= 256)
460 c = (c_save & 0177) + 0200;
462 else
464 if (nonascii_insert_offset > 0)
465 c -= nonascii_insert_offset;
466 if (c < 128 || c >= 256)
467 c = (c_save & 0177) + 0200;
471 return c;
475 /* Update the table Vcharset_table with the given arguments (see the
476 document of `define-charset' for the meaning of each argument).
477 Several other table contents are also updated. The caller should
478 check the validity of CHARSET-ID and the remaining arguments in
479 advance. */
481 void
482 update_charset_table (charset_id, dimension, chars, width, direction,
483 iso_final_char, iso_graphic_plane,
484 short_name, long_name, description)
485 Lisp_Object charset_id, dimension, chars, width, direction;
486 Lisp_Object iso_final_char, iso_graphic_plane;
487 Lisp_Object short_name, long_name, description;
489 int charset = XINT (charset_id);
490 int bytes;
491 unsigned char leading_code_base, leading_code_ext;
493 if (NILP (CHARSET_TABLE_ENTRY (charset)))
494 CHARSET_TABLE_ENTRY (charset)
495 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
497 if (NILP (long_name))
498 long_name = short_name;
499 if (NILP (description))
500 description = long_name;
502 /* Get byte length of multibyte form, base leading-code, and
503 extended leading-code of the charset. See the comment under the
504 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
505 bytes = XINT (dimension);
506 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
508 /* Official charset, it doesn't have an extended leading-code. */
509 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
510 bytes += 1; /* For a base leading-code. */
511 leading_code_base = charset;
512 leading_code_ext = 0;
514 else
516 /* Private charset. */
517 bytes += 2; /* For base and extended leading-codes. */
518 leading_code_base
519 = (charset < LEADING_CODE_EXT_12
520 ? LEADING_CODE_PRIVATE_11
521 : (charset < LEADING_CODE_EXT_21
522 ? LEADING_CODE_PRIVATE_12
523 : (charset < LEADING_CODE_EXT_22
524 ? LEADING_CODE_PRIVATE_21
525 : LEADING_CODE_PRIVATE_22)));
526 leading_code_ext = charset;
527 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
528 error ("Invalid dimension for the charset-ID %d", charset);
531 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
532 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
533 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
534 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
535 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
536 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
537 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
538 = make_number (leading_code_base);
539 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
540 = make_number (leading_code_ext);
541 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
542 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
543 = iso_graphic_plane;
544 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
545 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
546 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
547 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
550 /* If we have already defined a charset which has the same
551 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
552 DIRECTION, we must update the entry REVERSE-CHARSET of both
553 charsets. If there's no such charset, the value of the entry
554 is set to nil. */
555 int i;
557 for (i = 0; i <= MAX_CHARSET; i++)
558 if (!NILP (CHARSET_TABLE_ENTRY (i)))
560 if (CHARSET_DIMENSION (i) == XINT (dimension)
561 && CHARSET_CHARS (i) == XINT (chars)
562 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
563 && CHARSET_DIRECTION (i) != XINT (direction))
565 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
566 = make_number (i);
567 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
568 break;
571 if (i > MAX_CHARSET)
572 /* No such a charset. */
573 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
574 = make_number (-1);
577 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
578 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
580 bytes_by_char_head[leading_code_base] = bytes;
581 width_by_char_head[leading_code_base] = XINT (width);
583 /* Update table emacs_code_class. */
584 emacs_code_class[charset] = (bytes == 2
585 ? EMACS_leading_code_2
586 : (bytes == 3
587 ? EMACS_leading_code_3
588 : EMACS_leading_code_4));
591 /* Update table iso_charset_table. */
592 if (XINT (iso_final_char) >= 0
593 && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
594 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
597 #ifdef emacs
599 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
600 is invalid. */
602 get_charset_id (charset_symbol)
603 Lisp_Object charset_symbol;
605 Lisp_Object val;
606 int charset;
608 /* This originally used a ?: operator, but reportedly the HP-UX
609 compiler version HP92453-01 A.10.32.22 miscompiles that. */
610 if (SYMBOLP (charset_symbol)
611 && VECTORP (val = Fget (charset_symbol, Qcharset))
612 && CHARSET_VALID_P (charset =
613 XINT (XVECTOR (val)->contents[CHARSET_ID_IDX])))
614 return charset;
615 else
616 return -1;
619 /* Return an identification number for a new private charset of
620 DIMENSION and WIDTH. If there's no more room for the new charset,
621 return 0. */
622 Lisp_Object
623 get_new_private_charset_id (dimension, width)
624 int dimension, width;
626 int charset, from, to;
628 if (dimension == 1)
630 from = LEADING_CODE_EXT_11;
631 to = LEADING_CODE_EXT_21;
633 else
635 from = LEADING_CODE_EXT_21;
636 to = LEADING_CODE_EXT_MAX + 1;
639 for (charset = from; charset < to; charset++)
640 if (!CHARSET_DEFINED_P (charset)) break;
642 return make_number (charset < to ? charset : 0);
645 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
646 doc: /* Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.
647 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is
648 treated as a private charset.
649 INFO-VECTOR is a vector of the format:
650 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE
651 SHORT-NAME LONG-NAME DESCRIPTION]
652 The meanings of each elements is as follows:
653 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.
654 CHARS (integer) is the number of characters in a dimension: 94 or 96.
655 WIDTH (integer) is the number of columns a character in the charset
656 occupies on the screen: one of 0, 1, and 2.
658 DIRECTION (integer) is the rendering direction of characters in the
659 charset when rendering. If 0, render from left to right, else
660 render from right to left.
662 ISO-FINAL-CHAR (character) is the final character of the
663 corresponding ISO 2022 charset.
664 It may be -1 if the charset is internal use only.
666 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked
667 while encoding to variants of ISO 2022 coding system, one of the
668 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).
669 It may be -1 if the charset is internal use only.
671 SHORT-NAME (string) is the short name to refer to the charset.
673 LONG-NAME (string) is the long name to refer to the charset.
675 DESCRIPTION (string) is the description string of the charset. */)
676 (charset_id, charset_symbol, info_vector)
677 Lisp_Object charset_id, charset_symbol, info_vector;
679 Lisp_Object *vec;
681 if (!NILP (charset_id))
682 CHECK_NUMBER (charset_id);
683 CHECK_SYMBOL (charset_symbol);
684 CHECK_VECTOR (info_vector);
686 if (! NILP (charset_id))
688 if (! CHARSET_VALID_P (XINT (charset_id)))
689 error ("Invalid CHARSET: %d", XINT (charset_id));
690 else if (CHARSET_DEFINED_P (XINT (charset_id)))
691 error ("Already defined charset: %d", XINT (charset_id));
694 vec = XVECTOR (info_vector)->contents;
695 if (XVECTOR (info_vector)->size != 9
696 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
697 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
698 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
699 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
700 || !INTEGERP (vec[4])
701 || !(XINT (vec[4]) == -1 || (XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~'))
702 || !INTEGERP (vec[5])
703 || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
704 || !STRINGP (vec[6])
705 || !STRINGP (vec[7])
706 || !STRINGP (vec[8]))
707 error ("Invalid info-vector argument for defining charset %s",
708 SDATA (SYMBOL_NAME (charset_symbol)));
710 if (NILP (charset_id))
712 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
713 if (XINT (charset_id) == 0)
714 error ("There's no room for a new private charset %s",
715 SDATA (SYMBOL_NAME (charset_symbol)));
718 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
719 vec[4], vec[5], vec[6], vec[7], vec[8]);
720 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
721 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
722 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
723 Fupdate_coding_systems_internal ();
724 return Qnil;
727 DEFUN ("generic-character-list", Fgeneric_character_list,
728 Sgeneric_character_list, 0, 0, 0,
729 doc: /* Return a list of all possible generic characters.
730 It includes a generic character for a charset not yet defined. */)
733 return Vgeneric_character_list;
736 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
737 Sget_unused_iso_final_char, 2, 2, 0,
738 doc: /* Return an unused ISO's final char for a charset of DIMENSION and CHARS.
739 DIMENSION is the number of bytes to represent a character: 1 or 2.
740 CHARS is the number of characters in a dimension: 94 or 96.
742 This final char is for private use, thus the range is `0' (48) .. `?' (63).
743 If there's no unused final char for the specified kind of charset,
744 return nil. */)
745 (dimension, chars)
746 Lisp_Object dimension, chars;
748 int final_char;
750 CHECK_NUMBER (dimension);
751 CHECK_NUMBER (chars);
752 if (XINT (dimension) != 1 && XINT (dimension) != 2)
753 error ("Invalid charset dimension %d, it should be 1 or 2",
754 XINT (dimension));
755 if (XINT (chars) != 94 && XINT (chars) != 96)
756 error ("Invalid charset chars %d, it should be 94 or 96",
757 XINT (chars));
758 for (final_char = '0'; final_char <= '?'; final_char++)
760 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
761 break;
763 return (final_char <= '?' ? make_number (final_char) : Qnil);
766 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
767 4, 4, 0,
768 doc: /* Declare an equivalent charset for ISO-2022 decoding.
770 On decoding by an ISO-2022 base coding system, when a charset
771 specified by DIMENSION, CHARS, and FINAL-CHAR is designated, behave as
772 if CHARSET is designated instead. */)
773 (dimension, chars, final_char, charset)
774 Lisp_Object dimension, chars, final_char, charset;
776 int charset_id;
778 CHECK_NUMBER (dimension);
779 CHECK_NUMBER (chars);
780 CHECK_NUMBER (final_char);
781 CHECK_SYMBOL (charset);
783 if (XINT (dimension) != 1 && XINT (dimension) != 2)
784 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
785 if (XINT (chars) != 94 && XINT (chars) != 96)
786 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
787 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
788 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
789 if ((charset_id = get_charset_id (charset)) < 0)
790 error ("Invalid charset %s", SDATA (SYMBOL_NAME (charset)));
792 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset_id;
793 return Qnil;
796 /* Return information about charsets in the text at PTR of NBYTES
797 bytes, which are NCHARS characters. The value is:
799 0: Each character is represented by one byte. This is always
800 true for unibyte text.
801 1: No charsets other than ascii eight-bit-control,
802 eight-bit-graphic, and latin-1 are found.
803 2: Otherwise.
805 In addition, if CHARSETS is nonzero, for each found charset N, set
806 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
807 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
808 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
809 1 (note that there's no charset whose ID is 1). */
812 find_charset_in_text (ptr, nchars, nbytes, charsets, table)
813 const unsigned char *ptr;
814 int nchars, nbytes, *charsets;
815 Lisp_Object table;
817 if (nchars == nbytes)
819 if (charsets && nbytes > 0)
821 const unsigned char *endp = ptr + nbytes;
822 int maskbits = 0;
824 while (ptr < endp && maskbits != 7)
826 maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
827 ptr++;
830 if (maskbits & 1)
831 charsets[CHARSET_ASCII] = 1;
832 if (maskbits & 2)
833 charsets[CHARSET_8_BIT_CONTROL] = 1;
834 if (maskbits & 4)
835 charsets[CHARSET_8_BIT_GRAPHIC] = 1;
837 return 0;
839 else
841 int return_val = 1;
842 int bytes, charset, c1, c2;
844 if (! CHAR_TABLE_P (table))
845 table = Qnil;
847 while (nchars-- > 0)
849 SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
850 ptr += bytes;
852 if (!CHARSET_DEFINED_P (charset))
853 charset = 1;
854 else if (! NILP (table))
856 int c = translate_char (table, -1, charset, c1, c2);
857 if (c >= 0)
858 charset = CHAR_CHARSET (c);
861 if (return_val == 1
862 && charset != CHARSET_ASCII
863 && charset != CHARSET_8_BIT_CONTROL
864 && charset != CHARSET_8_BIT_GRAPHIC
865 && charset != charset_latin_iso8859_1)
866 return_val = 2;
868 if (charsets)
869 charsets[charset] = 1;
870 else if (return_val == 2)
871 break;
873 return return_val;
877 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
878 2, 3, 0,
879 doc: /* Return a list of charsets in the region between BEG and END.
880 BEG and END are buffer positions.
881 Optional arg TABLE if non-nil is a translation table to look up.
883 If the region contains invalid multibyte characters,
884 `unknown' is included in the returned list.
886 If the current buffer is unibyte, the returned list may contain
887 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
888 (beg, end, table)
889 Lisp_Object beg, end, table;
891 int charsets[MAX_CHARSET + 1];
892 int from, from_byte, to, stop, stop_byte, i;
893 Lisp_Object val;
895 validate_region (&beg, &end);
896 from = XFASTINT (beg);
897 stop = to = XFASTINT (end);
899 if (from < GPT && GPT < to)
901 stop = GPT;
902 stop_byte = GPT_BYTE;
904 else
905 stop_byte = CHAR_TO_BYTE (stop);
907 from_byte = CHAR_TO_BYTE (from);
909 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
910 while (1)
912 find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from,
913 stop_byte - from_byte, charsets, table);
914 if (stop < to)
916 from = stop, from_byte = stop_byte;
917 stop = to, stop_byte = CHAR_TO_BYTE (stop);
919 else
920 break;
923 val = Qnil;
924 if (charsets[1])
925 val = Fcons (Qunknown, val);
926 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
927 if (charsets[i])
928 val = Fcons (CHARSET_SYMBOL (i), val);
929 if (charsets[0])
930 val = Fcons (Qascii, val);
931 return val;
934 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
935 1, 2, 0,
936 doc: /* Return a list of charsets in STR.
937 Optional arg TABLE if non-nil is a translation table to look up.
939 If the string contains invalid multibyte characters,
940 `unknown' is included in the returned list.
942 If STR is unibyte, the returned list may contain
943 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
944 (str, table)
945 Lisp_Object str, table;
947 int charsets[MAX_CHARSET + 1];
948 int i;
949 Lisp_Object val;
951 CHECK_STRING (str);
953 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
954 find_charset_in_text (SDATA (str), SCHARS (str),
955 SBYTES (str), charsets, table);
957 val = Qnil;
958 if (charsets[1])
959 val = Fcons (Qunknown, val);
960 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
961 if (charsets[i])
962 val = Fcons (CHARSET_SYMBOL (i), val);
963 if (charsets[0])
964 val = Fcons (Qascii, val);
965 return val;
969 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
970 doc: /* Return a character made from arguments.
971 Internal use only. */)
972 (charset, code1, code2)
973 Lisp_Object charset, code1, code2;
975 int charset_id, c1, c2;
977 CHECK_NUMBER (charset);
978 charset_id = XINT (charset);
979 if (!CHARSET_DEFINED_P (charset_id))
980 error ("Invalid charset ID: %d", XINT (charset));
982 if (NILP (code1))
983 c1 = 0;
984 else
986 CHECK_NUMBER (code1);
987 c1 = XINT (code1);
989 if (NILP (code2))
990 c2 = 0;
991 else
993 CHECK_NUMBER (code2);
994 c2 = XINT (code2);
997 if (charset_id == CHARSET_ASCII)
999 if (c1 < 0 || c1 > 0x7F)
1000 goto invalid_code_posints;
1001 return make_number (c1);
1003 else if (charset_id == CHARSET_8_BIT_CONTROL)
1005 if (NILP (code1))
1006 c1 = 0x80;
1007 else if (c1 < 0x80 || c1 > 0x9F)
1008 goto invalid_code_posints;
1009 return make_number (c1);
1011 else if (charset_id == CHARSET_8_BIT_GRAPHIC)
1013 if (NILP (code1))
1014 c1 = 0xA0;
1015 else if (c1 < 0xA0 || c1 > 0xFF)
1016 goto invalid_code_posints;
1017 return make_number (c1);
1019 else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
1020 goto invalid_code_posints;
1021 c1 &= 0x7F;
1022 c2 &= 0x7F;
1023 if (c1 == 0
1024 ? c2 != 0
1025 : (c2 == 0
1026 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
1027 : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
1028 goto invalid_code_posints;
1029 return make_number (MAKE_CHAR (charset_id, c1, c2));
1031 invalid_code_posints:
1032 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
1035 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1036 doc: /* Return list of charset and one or two position-codes of CH.
1037 If CH is invalid as a character code,
1038 return a list of symbol `unknown' and CH. */)
1039 (ch)
1040 Lisp_Object ch;
1042 int c, charset, c1, c2;
1044 CHECK_NUMBER (ch);
1045 c = XFASTINT (ch);
1046 if (!CHAR_VALID_P (c, 1))
1047 return Fcons (Qunknown, Fcons (ch, Qnil));
1048 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1049 return (c2 >= 0
1050 ? Fcons (CHARSET_SYMBOL (charset),
1051 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1052 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1055 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1056 doc: /* Return charset of CH. */)
1057 (ch)
1058 Lisp_Object ch;
1060 CHECK_NUMBER (ch);
1062 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1065 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1066 doc: /* Return charset of a character in the current buffer at position POS.
1067 If POS is nil, it defauls to the current point.
1068 If POS is out of range, the value is nil. */)
1069 (pos)
1070 Lisp_Object pos;
1072 Lisp_Object ch;
1073 int charset;
1075 ch = Fchar_after (pos);
1076 if (! INTEGERP (ch))
1077 return ch;
1078 charset = CHAR_CHARSET (XINT (ch));
1079 return CHARSET_SYMBOL (charset);
1082 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1083 doc: /* Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.
1085 ISO 2022's designation sequence (escape sequence) distinguishes charsets
1086 by their DIMENSION, CHARS, and FINAL-CHAR,
1087 where as Emacs distinguishes them by charset symbol.
1088 See the documentation of the function `charset-info' for the meanings of
1089 DIMENSION, CHARS, and FINAL-CHAR. */)
1090 (dimension, chars, final_char)
1091 Lisp_Object dimension, chars, final_char;
1093 int charset;
1095 CHECK_NUMBER (dimension);
1096 CHECK_NUMBER (chars);
1097 CHECK_NUMBER (final_char);
1099 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1100 return Qnil;
1101 return CHARSET_SYMBOL (charset);
1104 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1105 generic character. If GENERICP is zero, return nonzero iff C is a
1106 valid normal character. Do not call this function directly,
1107 instead use macro CHAR_VALID_P. */
1109 char_valid_p (c, genericp)
1110 int c, genericp;
1112 int charset, c1, c2;
1114 if (c < 0 || c >= MAX_CHAR)
1115 return 0;
1116 if (SINGLE_BYTE_CHAR_P (c))
1117 return 1;
1118 SPLIT_CHAR (c, charset, c1, c2);
1119 if (genericp)
1121 if (c1)
1123 if (c2 <= 0) c2 = 0x20;
1125 else
1127 if (c2 <= 0) c1 = c2 = 0x20;
1130 return (CHARSET_DEFINED_P (charset)
1131 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1134 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1135 doc: /* Return t if OBJECT is a valid normal character.
1136 If optional arg GENERICP is non-nil, also return t if OBJECT is
1137 a valid generic character. */)
1138 (object, genericp)
1139 Lisp_Object object, genericp;
1141 if (! NATNUMP (object))
1142 return Qnil;
1143 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1146 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1147 Sunibyte_char_to_multibyte, 1, 1, 0,
1148 doc: /* Convert the unibyte character CH to multibyte character.
1149 The conversion is done based on `nonascii-translation-table' (which see)
1150 or `nonascii-insert-offset' (which see). */)
1151 (ch)
1152 Lisp_Object ch;
1154 int c;
1156 CHECK_NUMBER (ch);
1157 c = XINT (ch);
1158 if (c < 0 || c >= 0400)
1159 error ("Invalid unibyte character: %d", c);
1160 c = unibyte_char_to_multibyte (c);
1161 if (c < 0)
1162 error ("Can't convert to multibyte character: %d", XINT (ch));
1163 return make_number (c);
1166 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1167 Smultibyte_char_to_unibyte, 1, 1, 0,
1168 doc: /* Convert the multibyte character CH to unibyte character.
1169 The conversion is done based on `nonascii-translation-table' (which see)
1170 or `nonascii-insert-offset' (which see). */)
1171 (ch)
1172 Lisp_Object ch;
1174 int c;
1176 CHECK_NUMBER (ch);
1177 c = XINT (ch);
1178 if (! CHAR_VALID_P (c, 0))
1179 error ("Invalid multibyte character: %d", c);
1180 c = multibyte_char_to_unibyte (c, Qnil);
1181 if (c < 0)
1182 error ("Can't convert to unibyte character: %d", XINT (ch));
1183 return make_number (c);
1186 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1187 doc: /* Return 1 regardless of the argument CH. */)
1188 (ch)
1189 Lisp_Object ch;
1191 CHECK_NUMBER (ch);
1192 return make_number (1);
1195 /* Return how many bytes C will occupy in a multibyte buffer.
1196 Don't call this function directly, instead use macro CHAR_BYTES. */
1198 char_bytes (c)
1199 int c;
1201 int charset;
1203 if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1)))
1204 return 1;
1205 if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1206 return 1;
1208 charset = CHAR_CHARSET (c);
1209 return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1212 /* Return the width of character of which multi-byte form starts with
1213 C. The width is measured by how many columns occupied on the
1214 screen when displayed in the current buffer. */
1216 #define ONE_BYTE_CHAR_WIDTH(c) \
1217 (c < 0x20 \
1218 ? (c == '\t' \
1219 ? XFASTINT (current_buffer->tab_width) \
1220 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1221 : (c < 0x7f \
1222 ? 1 \
1223 : (c == 0x7F \
1224 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1225 : ((! NILP (current_buffer->enable_multibyte_characters) \
1226 && BASE_LEADING_CODE_P (c)) \
1227 ? WIDTH_BY_CHAR_HEAD (c) \
1228 : 4))))
1230 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1231 doc: /* Return width of CH when displayed in the current buffer.
1232 The width is measured by how many columns it occupies on the screen.
1233 Tab is taken to occupy `tab-width' columns. */)
1234 (ch)
1235 Lisp_Object ch;
1237 Lisp_Object val, disp;
1238 int c;
1239 struct Lisp_Char_Table *dp = buffer_display_table ();
1241 CHECK_NUMBER (ch);
1243 c = XINT (ch);
1245 /* Get the way the display table would display it. */
1246 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1248 if (VECTORP (disp))
1249 XSETINT (val, XVECTOR (disp)->size);
1250 else if (SINGLE_BYTE_CHAR_P (c))
1251 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1252 else
1254 int charset = CHAR_CHARSET (c);
1256 XSETFASTINT (val, CHARSET_WIDTH (charset));
1258 return val;
1261 /* Return width of string STR of length LEN when displayed in the
1262 current buffer. The width is measured by how many columns it
1263 occupies on the screen. */
1266 strwidth (str, len)
1267 unsigned char *str;
1268 int len;
1270 return c_string_width (str, len, -1, NULL, NULL);
1273 /* Return width of string STR of length LEN when displayed in the
1274 current buffer. The width is measured by how many columns it
1275 occupies on the screen. If PRECISION > 0, return the width of
1276 longest substring that doesn't exceed PRECISION, and set number of
1277 characters and bytes of the substring in *NCHARS and *NBYTES
1278 respectively. */
1281 c_string_width (str, len, precision, nchars, nbytes)
1282 const unsigned char *str;
1283 int len, precision, *nchars, *nbytes;
1285 int i = 0, i_byte = 0;
1286 int width = 0;
1287 int chars;
1288 struct Lisp_Char_Table *dp = buffer_display_table ();
1290 while (i_byte < len)
1292 int bytes, thiswidth;
1293 Lisp_Object val;
1295 if (dp)
1297 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
1299 chars = 1;
1300 val = DISP_CHAR_VECTOR (dp, c);
1301 if (VECTORP (val))
1302 thiswidth = XVECTOR (val)->size;
1303 else
1304 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1306 else
1308 chars = 1;
1309 PARSE_MULTIBYTE_SEQ (str + i_byte, len - i_byte, bytes);
1310 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1313 if (precision > 0
1314 && (width + thiswidth > precision))
1316 *nchars = i;
1317 *nbytes = i_byte;
1318 return width;
1320 i++;
1321 i_byte += bytes;
1322 width += thiswidth;
1325 if (precision > 0)
1327 *nchars = i;
1328 *nbytes = i_byte;
1331 return width;
1334 /* Return width of Lisp string STRING when displayed in the current
1335 buffer. The width is measured by how many columns it occupies on
1336 the screen while paying attention to compositions. If PRECISION >
1337 0, return the width of longest substring that doesn't exceed
1338 PRECISION, and set number of characters and bytes of the substring
1339 in *NCHARS and *NBYTES respectively. */
1342 lisp_string_width (string, precision, nchars, nbytes)
1343 Lisp_Object string;
1344 int precision, *nchars, *nbytes;
1346 int len = SCHARS (string);
1347 int len_byte = SBYTES (string);
1348 /* This set multibyte to 0 even if STRING is multibyte when it
1349 contains only ascii and eight-bit-graphic, but that's
1350 intentional. */
1351 int multibyte = len < len_byte;
1352 const unsigned char *str = SDATA (string);
1353 int i = 0, i_byte = 0;
1354 int width = 0;
1355 struct Lisp_Char_Table *dp = buffer_display_table ();
1357 while (i < len)
1359 int chars, bytes, thiswidth;
1360 Lisp_Object val;
1361 int cmp_id;
1362 int ignore, end;
1364 if (find_composition (i, -1, &ignore, &end, &val, string)
1365 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
1366 >= 0))
1368 thiswidth = composition_table[cmp_id]->width;
1369 chars = end - i;
1370 bytes = string_char_to_byte (string, end) - i_byte;
1372 else if (dp)
1374 int c;
1376 if (multibyte)
1377 c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
1378 else
1379 c = str[i_byte], bytes = 1;
1380 chars = 1;
1381 val = DISP_CHAR_VECTOR (dp, c);
1382 if (VECTORP (val))
1383 thiswidth = XVECTOR (val)->size;
1384 else
1385 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1387 else
1389 chars = 1;
1390 if (multibyte)
1391 PARSE_MULTIBYTE_SEQ (str + i_byte, len_byte - i_byte, bytes);
1392 else
1393 bytes = 1;
1394 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1397 if (precision > 0
1398 && (width + thiswidth > precision))
1400 *nchars = i;
1401 *nbytes = i_byte;
1402 return width;
1404 i += chars;
1405 i_byte += bytes;
1406 width += thiswidth;
1409 if (precision > 0)
1411 *nchars = i;
1412 *nbytes = i_byte;
1415 return width;
1418 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1419 doc: /* Return width of STRING when displayed in the current buffer.
1420 Width is measured by how many columns it occupies on the screen.
1421 When calculating width of a multibyte character in STRING,
1422 only the base leading-code is considered; the validity of
1423 the following bytes is not checked. Tabs in STRING are always
1424 taken to occupy `tab-width' columns. */)
1425 (string)
1426 Lisp_Object string;
1428 Lisp_Object val;
1430 CHECK_STRING (string);
1431 XSETFASTINT (val, lisp_string_width (string, -1, NULL, NULL));
1432 return val;
1435 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1436 doc: /* Return the direction of CH.
1437 The returned value is 0 for left-to-right and 1 for right-to-left. */)
1438 (ch)
1439 Lisp_Object ch;
1441 int charset;
1443 CHECK_NUMBER (ch);
1444 charset = CHAR_CHARSET (XFASTINT (ch));
1445 if (!CHARSET_DEFINED_P (charset))
1446 invalid_character (XINT (ch));
1447 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1450 /* Return the number of characters in the NBYTES bytes at PTR.
1451 This works by looking at the contents and checking for multibyte sequences.
1452 However, if the current buffer has enable-multibyte-characters = nil,
1453 we treat each byte as a character. */
1456 chars_in_text (ptr, nbytes)
1457 const unsigned char *ptr;
1458 int nbytes;
1460 /* current_buffer is null at early stages of Emacs initialization. */
1461 if (current_buffer == 0
1462 || NILP (current_buffer->enable_multibyte_characters))
1463 return nbytes;
1465 return multibyte_chars_in_text (ptr, nbytes);
1468 /* Return the number of characters in the NBYTES bytes at PTR.
1469 This works by looking at the contents and checking for multibyte sequences.
1470 It ignores enable-multibyte-characters. */
1473 multibyte_chars_in_text (ptr, nbytes)
1474 const unsigned char *ptr;
1475 int nbytes;
1477 const unsigned char *endp;
1478 int chars, bytes;
1480 endp = ptr + nbytes;
1481 chars = 0;
1483 while (ptr < endp)
1485 PARSE_MULTIBYTE_SEQ (ptr, endp - ptr, bytes);
1486 ptr += bytes;
1487 chars++;
1490 return chars;
1493 /* Parse unibyte text at STR of LEN bytes as multibyte text, and
1494 count the numbers of characters and bytes in it. On counting
1495 bytes, pay attention to the fact that 8-bit characters in the range
1496 0x80..0x9F are represented by 2 bytes in multibyte text. */
1497 void
1498 parse_str_as_multibyte (str, len, nchars, nbytes)
1499 const unsigned char *str;
1500 int len, *nchars, *nbytes;
1502 const unsigned char *endp = str + len;
1503 int n, chars = 0, bytes = 0;
1505 while (str < endp)
1507 if (UNIBYTE_STR_AS_MULTIBYTE_P (str, endp - str, n))
1508 str += n, bytes += n;
1509 else
1510 str++, bytes += 2;
1511 chars++;
1513 *nchars = chars;
1514 *nbytes = bytes;
1515 return;
1518 /* Arrange unibyte text at STR of NBYTES bytes as multibyte text.
1519 It actually converts only 8-bit characters in the range 0x80..0x9F
1520 that don't contruct multibyte characters to multibyte forms. If
1521 NCHARS is nonzero, set *NCHARS to the number of characters in the
1522 text. It is assured that we can use LEN bytes at STR as a work
1523 area and that is enough. Return the number of bytes of the
1524 resulting text. */
1527 str_as_multibyte (str, len, nbytes, nchars)
1528 unsigned char *str;
1529 int len, nbytes, *nchars;
1531 unsigned char *p = str, *endp = str + nbytes;
1532 unsigned char *to;
1533 int chars = 0;
1534 int n;
1536 while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1537 p += n, chars++;
1538 if (nchars)
1539 *nchars = chars;
1540 if (p == endp)
1541 return nbytes;
1543 to = p;
1544 nbytes = endp - p;
1545 endp = str + len;
1546 safe_bcopy (p, endp - nbytes, nbytes);
1547 p = endp - nbytes;
1548 while (p < endp)
1550 if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1552 while (n--)
1553 *to++ = *p++;
1555 else
1557 *to++ = LEADING_CODE_8_BIT_CONTROL;
1558 *to++ = *p++ + 0x20;
1560 chars++;
1562 if (nchars)
1563 *nchars = chars;
1564 return (to - str);
1567 /* Parse unibyte string at STR of LEN bytes, and return the number of
1568 bytes it may ocupy when converted to multibyte string by
1569 `str_to_multibyte'. */
1572 parse_str_to_multibyte (str, len)
1573 unsigned char *str;
1574 int len;
1576 unsigned char *endp = str + len;
1577 int bytes;
1579 for (bytes = 0; str < endp; str++)
1580 bytes += (*str < 0x80 || *str >= 0xA0) ? 1 : 2;
1581 return bytes;
1584 /* Convert unibyte text at STR of NBYTES bytes to multibyte text
1585 that contains the same single-byte characters. It actually
1586 converts all 8-bit characters to multibyte forms. It is assured
1587 that we can use LEN bytes at STR as a work area and that is
1588 enough. */
1591 str_to_multibyte (str, len, bytes)
1592 unsigned char *str;
1593 int len, bytes;
1595 unsigned char *p = str, *endp = str + bytes;
1596 unsigned char *to;
1598 while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++;
1599 if (p == endp)
1600 return bytes;
1601 to = p;
1602 bytes = endp - p;
1603 endp = str + len;
1604 safe_bcopy (p, endp - bytes, bytes);
1605 p = endp - bytes;
1606 while (p < endp)
1608 if (*p < 0x80 || *p >= 0xA0)
1609 *to++ = *p++;
1610 else
1611 *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1613 return (to - str);
1616 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1617 actually converts only 8-bit characters in the range 0x80..0x9F to
1618 unibyte forms. */
1621 str_as_unibyte (str, bytes)
1622 unsigned char *str;
1623 int bytes;
1625 unsigned char *p = str, *endp = str + bytes;
1626 unsigned char *to = str;
1628 while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++;
1629 to = p;
1630 while (p < endp)
1632 if (*p == LEADING_CODE_8_BIT_CONTROL)
1633 *to++ = *(p + 1) - 0x20, p += 2;
1634 else
1635 *to++ = *p++;
1637 return (to - str);
1641 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
1642 doc: /* Concatenate all the argument characters and make the result a string.
1643 usage: (string &rest CHARACTERS) */)
1644 (n, args)
1645 int n;
1646 Lisp_Object *args;
1648 int i, bufsize;
1649 unsigned char *buf, *p;
1650 int c;
1651 int multibyte = 0;
1652 Lisp_Object ret;
1653 USE_SAFE_ALLOCA;
1655 bufsize = MAX_MULTIBYTE_LENGTH * n;
1656 SAFE_ALLOCA (buf, unsigned char *, bufsize);
1657 p = buf;
1659 for (i = 0; i < n; i++)
1661 CHECK_NUMBER (args[i]);
1662 if (!multibyte && !SINGLE_BYTE_CHAR_P (XFASTINT (args[i])))
1663 multibyte = 1;
1666 for (i = 0; i < n; i++)
1668 c = XINT (args[i]);
1669 if (multibyte)
1670 p += CHAR_STRING (c, p);
1671 else
1672 *p++ = c;
1675 ret = make_string_from_bytes (buf, n, p - buf);
1676 SAFE_FREE ();
1678 return ret;
1681 #endif /* emacs */
1684 charset_id_internal (charset_name)
1685 char *charset_name;
1687 Lisp_Object val;
1689 val= Fget (intern (charset_name), Qcharset);
1690 if (!VECTORP (val))
1691 error ("Charset %s is not defined", charset_name);
1693 return (XINT (XVECTOR (val)->contents[0]));
1696 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1697 Ssetup_special_charsets, 0, 0, 0, doc: /* Internal use only. */)
1700 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1701 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1702 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1703 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1704 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1705 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1706 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1707 charset_mule_unicode_0100_24ff
1708 = charset_id_internal ("mule-unicode-0100-24ff");
1709 charset_mule_unicode_2500_33ff
1710 = charset_id_internal ("mule-unicode-2500-33ff");
1711 charset_mule_unicode_e000_ffff
1712 = charset_id_internal ("mule-unicode-e000-ffff");
1713 return Qnil;
1716 void
1717 init_charset_once ()
1719 int i, j, k;
1721 staticpro (&Vcharset_table);
1722 staticpro (&Vcharset_symbol_table);
1723 staticpro (&Vgeneric_character_list);
1725 /* This has to be done here, before we call Fmake_char_table. */
1726 Qcharset_table = intern ("charset-table");
1727 staticpro (&Qcharset_table);
1729 /* Intern this now in case it isn't already done.
1730 Setting this variable twice is harmless.
1731 But don't staticpro it here--that is done in alloc.c. */
1732 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1734 /* Now we are ready to set up this property, so we can
1735 create the charset table. */
1736 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1737 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1739 Qunknown = intern ("unknown");
1740 staticpro (&Qunknown);
1741 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1742 Qunknown);
1744 /* Setup tables. */
1745 for (i = 0; i < 2; i++)
1746 for (j = 0; j < 2; j++)
1747 for (k = 0; k < 128; k++)
1748 iso_charset_table [i][j][k] = -1;
1750 for (i = 0; i < 256; i++)
1751 bytes_by_char_head[i] = 1;
1752 bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1753 bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1754 bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1755 bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1757 for (i = 0; i < 128; i++)
1758 width_by_char_head[i] = 1;
1759 for (; i < 256; i++)
1760 width_by_char_head[i] = 4;
1761 width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1762 width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1763 width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1764 width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1767 Lisp_Object val;
1769 val = Qnil;
1770 for (i = 0x81; i < 0x90; i++)
1771 val = Fcons (make_number ((i - 0x70) << 7), val);
1772 for (; i < 0x9A; i++)
1773 val = Fcons (make_number ((i - 0x8F) << 14), val);
1774 for (i = 0xA0; i < 0xF0; i++)
1775 val = Fcons (make_number ((i - 0x70) << 7), val);
1776 for (; i < 0xFF; i++)
1777 val = Fcons (make_number ((i - 0xE0) << 14), val);
1778 Vgeneric_character_list = Fnreverse (val);
1781 nonascii_insert_offset = 0;
1782 Vnonascii_translation_table = Qnil;
1785 #ifdef emacs
1787 void
1788 syms_of_charset ()
1790 Qcharset = intern ("charset");
1791 staticpro (&Qcharset);
1793 Qascii = intern ("ascii");
1794 staticpro (&Qascii);
1796 Qeight_bit_control = intern ("eight-bit-control");
1797 staticpro (&Qeight_bit_control);
1799 Qeight_bit_graphic = intern ("eight-bit-graphic");
1800 staticpro (&Qeight_bit_graphic);
1802 /* Define special charsets ascii, eight-bit-control, and
1803 eight-bit-graphic. */
1804 update_charset_table (make_number (CHARSET_ASCII),
1805 make_number (1), make_number (94),
1806 make_number (1),
1807 make_number (0),
1808 make_number ('B'),
1809 make_number (0),
1810 build_string ("ASCII"),
1811 Qnil, /* same as above */
1812 build_string ("ASCII (ISO646 IRV)"));
1813 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1814 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1816 update_charset_table (make_number (CHARSET_8_BIT_CONTROL),
1817 make_number (1), make_number (96),
1818 make_number (4),
1819 make_number (0),
1820 make_number (-1),
1821 make_number (-1),
1822 build_string ("8-bit control code (0x80..0x9F)"),
1823 Qnil, /* same as above */
1824 Qnil); /* same as above */
1825 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control;
1826 Fput (Qeight_bit_control, Qcharset,
1827 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL));
1829 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC),
1830 make_number (1), make_number (96),
1831 make_number (4),
1832 make_number (0),
1833 make_number (-1),
1834 make_number (-1),
1835 build_string ("8-bit graphic char (0xA0..0xFF)"),
1836 Qnil, /* same as above */
1837 Qnil); /* same as above */
1838 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1839 Fput (Qeight_bit_graphic, Qcharset,
1840 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1842 Qauto_fill_chars = intern ("auto-fill-chars");
1843 staticpro (&Qauto_fill_chars);
1844 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1846 defsubr (&Sdefine_charset);
1847 defsubr (&Sgeneric_character_list);
1848 defsubr (&Sget_unused_iso_final_char);
1849 defsubr (&Sdeclare_equiv_charset);
1850 defsubr (&Sfind_charset_region);
1851 defsubr (&Sfind_charset_string);
1852 defsubr (&Smake_char_internal);
1853 defsubr (&Ssplit_char);
1854 defsubr (&Schar_charset);
1855 defsubr (&Scharset_after);
1856 defsubr (&Siso_charset);
1857 defsubr (&Schar_valid_p);
1858 defsubr (&Sunibyte_char_to_multibyte);
1859 defsubr (&Smultibyte_char_to_unibyte);
1860 defsubr (&Schar_bytes);
1861 defsubr (&Schar_width);
1862 defsubr (&Sstring_width);
1863 defsubr (&Schar_direction);
1864 defsubr (&Sstring);
1865 defsubr (&Ssetup_special_charsets);
1867 DEFVAR_LISP ("charset-list", &Vcharset_list,
1868 doc: /* List of charsets ever defined. */);
1869 Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control,
1870 Fcons (Qeight_bit_graphic, Qnil)));
1872 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1873 doc: /* Vector of cons cell of a symbol and translation table ever defined.
1874 An ID of a translation table is an index of this vector. */);
1875 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1877 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1878 doc: /* Leading-code of private TYPE9N charset of column-width 1. */);
1879 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1881 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1882 doc: /* Leading-code of private TYPE9N charset of column-width 2. */);
1883 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1885 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1886 doc: /* Leading-code of private TYPE9Nx9N charset of column-width 1. */);
1887 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1889 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1890 doc: /* Leading-code of private TYPE9Nx9N charset of column-width 2. */);
1891 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1893 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1894 doc: /* Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.
1895 This is used for converting unibyte text to multibyte,
1896 and for inserting character codes specified by number.
1898 This serves to convert a Latin-1 or similar 8-bit character code
1899 to the corresponding Emacs multibyte character code.
1900 Typically the value should be (- (make-char CHARSET 0) 128),
1901 for your choice of character set.
1902 If `nonascii-translation-table' is non-nil, it overrides this variable. */);
1903 nonascii_insert_offset = 0;
1905 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1906 doc: /* Translation table to convert non-ASCII unibyte codes to multibyte.
1907 This is used for converting unibyte text to multibyte,
1908 and for inserting character codes specified by number.
1910 Conversion is performed only when multibyte characters are enabled,
1911 and it serves to convert a Latin-1 or similar 8-bit character code
1912 to the corresponding Emacs character code.
1914 If this is nil, `nonascii-insert-offset' is used instead.
1915 See also the docstring of `make-translation-table'. */);
1916 Vnonascii_translation_table = Qnil;
1918 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1919 doc: /* A char-table for characters which invoke auto-filling.
1920 Such characters have value t in this table. */);
1921 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1922 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1923 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1926 #endif /* emacs */
1928 /* arch-tag: 66a89b8d-4c28-47d3-9ca1-56f78440d69f
1929 (do not change this comment) */