*** empty log message ***
[emacs.git] / src / charset.c
blobd630ec995d66760ad5308d4610d9fa8c86285605
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
6 This file is part of GNU Emacs.
8 GNU Emacs is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
13 GNU Emacs is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GNU Emacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* At first, see the document in `charset.h' to understand the code in
24 this file. */
26 #ifdef emacs
27 #include <config.h>
28 #endif
30 #include <stdio.h>
32 #ifdef emacs
34 #include <sys/types.h>
35 #include "lisp.h"
36 #include "buffer.h"
37 #include "charset.h"
38 #include "composite.h"
39 #include "coding.h"
40 #include "disptab.h"
42 #else /* not emacs */
44 #include "mulelib.h"
46 #endif /* emacs */
48 Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
49 Lisp_Object Qunknown;
51 /* Declaration of special leading-codes. */
52 int leading_code_private_11; /* for private DIMENSION1 of 1-column */
53 int leading_code_private_12; /* for private DIMENSION1 of 2-column */
54 int leading_code_private_21; /* for private DIMENSION2 of 1-column */
55 int leading_code_private_22; /* for private DIMENSION2 of 2-column */
57 /* Declaration of special charsets. The values are set by
58 Fsetup_special_charsets. */
59 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
60 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
61 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
62 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
63 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
64 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
65 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
67 Lisp_Object Qcharset_table;
69 /* A char-table containing information of each character set. */
70 Lisp_Object Vcharset_table;
72 /* A vector of charset symbol indexed by charset-id. This is used
73 only for returning charset symbol from C functions. */
74 Lisp_Object Vcharset_symbol_table;
76 /* A list of charset symbols ever defined. */
77 Lisp_Object Vcharset_list;
79 /* Vector of translation table ever defined.
80 ID of a translation table is used to index this vector. */
81 Lisp_Object Vtranslation_table_vector;
83 /* A char-table for characters which may invoke auto-filling. */
84 Lisp_Object Vauto_fill_chars;
86 Lisp_Object Qauto_fill_chars;
88 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
89 int bytes_by_char_head[256];
90 int width_by_char_head[256];
92 /* Mapping table from ISO2022's charset (specified by DIMENSION,
93 CHARS, and FINAL-CHAR) to Emacs' charset. */
94 int iso_charset_table[2][2][128];
96 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
97 unsigned char *_fetch_multibyte_char_p;
98 int _fetch_multibyte_char_len;
100 /* Offset to add to a non-ASCII value when inserting it. */
101 int nonascii_insert_offset;
103 /* Translation table for converting non-ASCII unibyte characters
104 to multibyte codes, or nil. */
105 Lisp_Object Vnonascii_translation_table;
107 /* List of all possible generic characters. */
108 Lisp_Object Vgeneric_character_list;
110 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
111 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
113 void
114 invalid_character (c)
115 int c;
117 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
120 /* Parse string STR of length LENGTH and fetch information of a
121 character at STR. Set BYTES to the byte length the character
122 occupies, CHARSET, C1, C2 to proper values of the character. */
124 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
125 do { \
126 (c1) = *(str); \
127 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
128 if ((bytes) == 1) \
129 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
130 else if ((bytes) == 2) \
132 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
133 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
134 else \
135 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
137 else if ((bytes) == 3) \
139 if ((c1) < LEADING_CODE_PRIVATE_11) \
140 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
141 else \
142 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
144 else \
145 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
146 } while (0)
148 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
149 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
150 ((charset) == CHARSET_ASCII \
151 ? ((c1) >= 0 && (c1) <= 0x7F) \
152 : ((charset) == CHARSET_8_BIT_CONTROL \
153 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
154 : ((charset) == CHARSET_8_BIT_GRAPHIC \
155 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
156 : (CHARSET_DIMENSION (charset) == 1 \
157 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
158 : ((c1) >= 0x20 && (c1) <= 0x7F \
159 && (c2) >= 0x20 && (c2) <= 0x7F)))))
161 /* Store multi-byte form of the character C in STR. The caller should
162 allocate at least 4-byte area at STR in advance. Returns the
163 length of the multi-byte form. If C is an invalid character code,
164 return -1. */
167 char_to_string_1 (c, str)
168 int c;
169 unsigned char *str;
171 unsigned char *p = str;
173 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
175 /* Multibyte character can't have a modifier bit. */
176 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
177 return -1;
179 /* For Meta, Shift, and Control modifiers, we need special care. */
180 if (c & CHAR_META)
182 /* Move the meta bit to the right place for a string. */
183 c = (c & ~CHAR_META) | 0x80;
185 if (c & CHAR_SHIFT)
187 /* Shift modifier is valid only with [A-Za-z]. */
188 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
189 c &= ~CHAR_SHIFT;
190 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
191 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
193 if (c & CHAR_CTL)
195 /* Simulate the code in lread.c. */
196 /* Allow `\C- ' and `\C-?'. */
197 if (c == (CHAR_CTL | ' '))
198 c = 0;
199 else if (c == (CHAR_CTL | '?'))
200 c = 127;
201 /* ASCII control chars are made from letters (both cases),
202 as well as the non-letters within 0100...0137. */
203 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
204 c &= (037 | (~0177 & ~CHAR_CTL));
205 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
206 c &= (037 | (~0177 & ~CHAR_CTL));
209 /* If C still has any modifier bits, just ignore it. */
210 c &= ~CHAR_MODIFIER_MASK;
213 if (SINGLE_BYTE_CHAR_P (c))
215 if (ASCII_BYTE_P (c) || c >= 0xA0)
216 *p++ = c;
217 else
219 *p++ = LEADING_CODE_8_BIT_CONTROL;
220 *p++ = c + 0x20;
223 else if (CHAR_VALID_P (c, 0))
225 int charset, c1, c2;
227 SPLIT_CHAR (c, charset, c1, c2);
229 if (charset >= LEADING_CODE_EXT_11)
230 *p++ = (charset < LEADING_CODE_EXT_12
231 ? LEADING_CODE_PRIVATE_11
232 : (charset < LEADING_CODE_EXT_21
233 ? LEADING_CODE_PRIVATE_12
234 : (charset < LEADING_CODE_EXT_22
235 ? LEADING_CODE_PRIVATE_21
236 : LEADING_CODE_PRIVATE_22)));
237 *p++ = charset;
238 if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
239 return -1;
240 if (c1)
242 *p++ = c1 | 0x80;
243 if (c2 > 0)
244 *p++ = c2 | 0x80;
247 else
248 return -1;
250 return (p - str);
254 /* Store multi-byte form of the character C in STR. The caller should
255 allocate at least 4-byte area at STR in advance. Returns the
256 length of the multi-byte form. If C is an invalid character code,
257 signal an error.
259 Use macro `CHAR_STRING (C, STR)' instead of calling this function
260 directly if C can be an ASCII character. */
263 char_to_string (c, str)
264 int c;
265 unsigned char *str;
267 int len;
268 len = char_to_string_1 (c, str);
269 if (len == -1)
270 invalid_character (c);
271 return len;
275 /* Return the non-ASCII character corresponding to multi-byte form at
276 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
277 length of the multibyte form in *ACTUAL_LEN.
279 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
280 this function directly if you want ot handle ASCII characters as
281 well. */
284 string_to_char (str, len, actual_len)
285 const unsigned char *str;
286 int len, *actual_len;
288 int c, bytes, charset, c1, c2;
290 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
291 c = MAKE_CHAR (charset, c1, c2);
292 if (actual_len)
293 *actual_len = bytes;
294 return c;
297 /* Return the length of the multi-byte form at string STR of length LEN.
298 Use the macro MULTIBYTE_FORM_LENGTH instead. */
300 multibyte_form_length (str, len)
301 const unsigned char *str;
302 int len;
304 int bytes;
306 PARSE_MULTIBYTE_SEQ (str, len, bytes);
307 return bytes;
310 /* Check multibyte form at string STR of length LEN and set variables
311 pointed by CHARSET, C1, and C2 to charset and position codes of the
312 character at STR, and return 0. If there's no multibyte character,
313 return -1. This should be used only in the macro SPLIT_STRING
314 which checks range of STR in advance. */
317 split_string (str, len, charset, c1, c2)
318 const unsigned char *str;
319 unsigned char *c1, *c2;
320 int len, *charset;
322 register int bytes, cs, code1, code2 = -1;
324 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
325 if (cs == CHARSET_ASCII)
326 return -1;
327 *charset = cs;
328 *c1 = code1;
329 *c2 = code2;
330 return 0;
333 /* Return 1 iff character C has valid printable glyph.
334 Use the macro CHAR_PRINTABLE_P instead. */
336 char_printable_p (c)
337 int c;
339 int charset, c1, c2;
341 if (ASCII_BYTE_P (c))
342 return 1;
343 else if (SINGLE_BYTE_CHAR_P (c))
344 return 0;
345 else if (c >= MAX_CHAR)
346 return 0;
348 SPLIT_CHAR (c, charset, c1, c2);
349 if (! CHARSET_DEFINED_P (charset))
350 return 0;
351 if (CHARSET_CHARS (charset) == 94
352 ? c1 <= 32 || c1 >= 127
353 : c1 < 32)
354 return 0;
355 if (CHARSET_DIMENSION (charset) == 2
356 && (CHARSET_CHARS (charset) == 94
357 ? c2 <= 32 || c2 >= 127
358 : c2 < 32))
359 return 0;
360 return 1;
363 /* Translate character C by translation table TABLE. If C
364 is negative, translate a character specified by CHARSET, C1, and C2
365 (C1 and C2 are code points of the character). If no translation is
366 found in TABLE, return C. */
368 translate_char (table, c, charset, c1, c2)
369 Lisp_Object table;
370 int c, charset, c1, c2;
372 Lisp_Object ch;
373 int alt_charset, alt_c1, alt_c2, dimension;
375 if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
376 if (!CHAR_TABLE_P (table)
377 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
378 return c;
380 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
381 dimension = CHARSET_DIMENSION (alt_charset);
382 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
383 /* CH is not a generic character, just return it. */
384 return XFASTINT (ch);
386 /* Since CH is a generic character, we must return a specific
387 charater which has the same position codes as C from CH. */
388 if (charset < 0)
389 SPLIT_CHAR (c, charset, c1, c2);
390 if (dimension != CHARSET_DIMENSION (charset))
391 /* We can't make such a character because of dimension mismatch. */
392 return c;
393 return MAKE_CHAR (alt_charset, c1, c2);
396 /* Convert the unibyte character C to multibyte based on
397 Vnonascii_translation_table or nonascii_insert_offset. If they can't
398 convert C to a valid multibyte character, convert it based on
399 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
402 unibyte_char_to_multibyte (c)
403 int c;
405 if (c < 0400 && c >= 0200)
407 int c_save = c;
409 if (! NILP (Vnonascii_translation_table))
411 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
412 if (c >= 0400 && ! char_valid_p (c, 0))
413 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
415 else if (c >= 0240 && nonascii_insert_offset > 0)
417 c += nonascii_insert_offset;
418 if (c < 0400 || ! char_valid_p (c, 0))
419 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
421 else if (c >= 0240)
422 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
424 return c;
428 /* Convert the multibyte character C to unibyte 8-bit character based
429 on Vnonascii_translation_table or nonascii_insert_offset. If
430 REV_TBL is non-nil, it should be a reverse table of
431 Vnonascii_translation_table, i.e. what given by:
432 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
435 multibyte_char_to_unibyte (c, rev_tbl)
436 int c;
437 Lisp_Object rev_tbl;
439 if (!SINGLE_BYTE_CHAR_P (c))
441 int c_save = c;
443 if (! CHAR_TABLE_P (rev_tbl)
444 && CHAR_TABLE_P (Vnonascii_translation_table))
445 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
446 make_number (0));
447 if (CHAR_TABLE_P (rev_tbl))
449 Lisp_Object temp;
450 temp = Faref (rev_tbl, make_number (c));
451 if (INTEGERP (temp))
452 c = XINT (temp);
453 if (c >= 256)
454 c = (c_save & 0177) + 0200;
456 else
458 if (nonascii_insert_offset > 0)
459 c -= nonascii_insert_offset;
460 if (c < 128 || c >= 256)
461 c = (c_save & 0177) + 0200;
465 return c;
469 /* Update the table Vcharset_table with the given arguments (see the
470 document of `define-charset' for the meaning of each argument).
471 Several other table contents are also updated. The caller should
472 check the validity of CHARSET-ID and the remaining arguments in
473 advance. */
475 void
476 update_charset_table (charset_id, dimension, chars, width, direction,
477 iso_final_char, iso_graphic_plane,
478 short_name, long_name, description)
479 Lisp_Object charset_id, dimension, chars, width, direction;
480 Lisp_Object iso_final_char, iso_graphic_plane;
481 Lisp_Object short_name, long_name, description;
483 int charset = XINT (charset_id);
484 int bytes;
485 unsigned char leading_code_base, leading_code_ext;
487 if (NILP (CHARSET_TABLE_ENTRY (charset)))
488 CHARSET_TABLE_ENTRY (charset)
489 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
491 if (NILP (long_name))
492 long_name = short_name;
493 if (NILP (description))
494 description = long_name;
496 /* Get byte length of multibyte form, base leading-code, and
497 extended leading-code of the charset. See the comment under the
498 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
499 bytes = XINT (dimension);
500 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
502 /* Official charset, it doesn't have an extended leading-code. */
503 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
504 bytes += 1; /* For a base leading-code. */
505 leading_code_base = charset;
506 leading_code_ext = 0;
508 else
510 /* Private charset. */
511 bytes += 2; /* For base and extended leading-codes. */
512 leading_code_base
513 = (charset < LEADING_CODE_EXT_12
514 ? LEADING_CODE_PRIVATE_11
515 : (charset < LEADING_CODE_EXT_21
516 ? LEADING_CODE_PRIVATE_12
517 : (charset < LEADING_CODE_EXT_22
518 ? LEADING_CODE_PRIVATE_21
519 : LEADING_CODE_PRIVATE_22)));
520 leading_code_ext = charset;
521 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
522 error ("Invalid dimension for the charset-ID %d", charset);
525 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
526 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
527 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
528 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
529 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
530 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
531 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
532 = make_number (leading_code_base);
533 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
534 = make_number (leading_code_ext);
535 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
536 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
537 = iso_graphic_plane;
538 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
539 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
540 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
541 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
544 /* If we have already defined a charset which has the same
545 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
546 DIRECTION, we must update the entry REVERSE-CHARSET of both
547 charsets. If there's no such charset, the value of the entry
548 is set to nil. */
549 int i;
551 for (i = 0; i <= MAX_CHARSET; i++)
552 if (!NILP (CHARSET_TABLE_ENTRY (i)))
554 if (CHARSET_DIMENSION (i) == XINT (dimension)
555 && CHARSET_CHARS (i) == XINT (chars)
556 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
557 && CHARSET_DIRECTION (i) != XINT (direction))
559 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
560 = make_number (i);
561 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
562 break;
565 if (i > MAX_CHARSET)
566 /* No such a charset. */
567 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
568 = make_number (-1);
571 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
572 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
574 bytes_by_char_head[leading_code_base] = bytes;
575 width_by_char_head[leading_code_base] = XINT (width);
577 /* Update table emacs_code_class. */
578 emacs_code_class[charset] = (bytes == 2
579 ? EMACS_leading_code_2
580 : (bytes == 3
581 ? EMACS_leading_code_3
582 : EMACS_leading_code_4));
585 /* Update table iso_charset_table. */
586 if (XINT (iso_final_char) >= 0
587 && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
588 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
591 #ifdef emacs
593 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
594 is invalid. */
596 get_charset_id (charset_symbol)
597 Lisp_Object charset_symbol;
599 Lisp_Object val;
600 int charset;
602 /* This originally used a ?: operator, but reportedly the HP-UX
603 compiler version HP92453-01 A.10.32.22 miscompiles that. */
604 if (SYMBOLP (charset_symbol)
605 && VECTORP (val = Fget (charset_symbol, Qcharset))
606 && CHARSET_VALID_P (charset =
607 XINT (XVECTOR (val)->contents[CHARSET_ID_IDX])))
608 return charset;
609 else
610 return -1;
613 /* Return an identification number for a new private charset of
614 DIMENSION and WIDTH. If there's no more room for the new charset,
615 return 0. */
616 Lisp_Object
617 get_new_private_charset_id (dimension, width)
618 int dimension, width;
620 int charset, from, to;
622 if (dimension == 1)
624 from = LEADING_CODE_EXT_11;
625 to = LEADING_CODE_EXT_21;
627 else
629 from = LEADING_CODE_EXT_21;
630 to = LEADING_CODE_EXT_MAX + 1;
633 for (charset = from; charset < to; charset++)
634 if (!CHARSET_DEFINED_P (charset)) break;
636 return make_number (charset < to ? charset : 0);
639 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
640 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
641 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
642 treated as a private charset.\n\
643 INFO-VECTOR is a vector of the format:\n\
644 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
645 SHORT-NAME LONG-NAME DESCRIPTION]\n\
646 The meanings of each elements is as follows:\n\
647 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
648 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
649 WIDTH (integer) is the number of columns a character in the charset\n\
650 occupies on the screen: one of 0, 1, and 2.\n\
652 DIRECTION (integer) is the rendering direction of characters in the\n\
653 charset when rendering. If 0, render from left to right, else\n\
654 render from right to left.\n\
656 ISO-FINAL-CHAR (character) is the final character of the\n\
657 corresponding ISO 2022 charset.\n\
658 It may be -1 if the charset is internal use only.\n\
660 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
661 while encoding to variants of ISO 2022 coding system, one of the\n\
662 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
663 It may be -1 if the charset is internal use only.\n\
665 SHORT-NAME (string) is the short name to refer to the charset.\n\
667 LONG-NAME (string) is the long name to refer to the charset.\n\
669 DESCRIPTION (string) is the description string of the charset.")
670 (charset_id, charset_symbol, info_vector)
671 Lisp_Object charset_id, charset_symbol, info_vector;
673 Lisp_Object *vec;
675 if (!NILP (charset_id))
676 CHECK_NUMBER (charset_id, 0);
677 CHECK_SYMBOL (charset_symbol, 1);
678 CHECK_VECTOR (info_vector, 2);
680 if (! NILP (charset_id))
682 if (! CHARSET_VALID_P (XINT (charset_id)))
683 error ("Invalid CHARSET: %d", XINT (charset_id));
684 else if (CHARSET_DEFINED_P (XINT (charset_id)))
685 error ("Already defined charset: %d", XINT (charset_id));
688 vec = XVECTOR (info_vector)->contents;
689 if (XVECTOR (info_vector)->size != 9
690 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
691 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
692 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
693 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
694 || !INTEGERP (vec[4])
695 || !(XINT (vec[4]) == -1 || XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
696 || !INTEGERP (vec[5])
697 || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
698 || !STRINGP (vec[6])
699 || !STRINGP (vec[7])
700 || !STRINGP (vec[8]))
701 error ("Invalid info-vector argument for defining charset %s",
702 XSYMBOL (charset_symbol)->name->data);
704 if (NILP (charset_id))
706 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
707 if (XINT (charset_id) == 0)
708 error ("There's no room for a new private charset %s",
709 XSYMBOL (charset_symbol)->name->data);
712 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
713 vec[4], vec[5], vec[6], vec[7], vec[8]);
714 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
715 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
716 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
717 Fupdate_coding_systems_internal ();
718 return Qnil;
721 DEFUN ("generic-character-list", Fgeneric_character_list,
722 Sgeneric_character_list, 0, 0, 0,
723 "Return a list of all possible generic characters.\n\
724 It includes a generic character for a charset not yet defined.")
727 return Vgeneric_character_list;
730 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
731 Sget_unused_iso_final_char, 2, 2, 0,
732 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
733 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
734 CHARS is the number of characters in a dimension: 94 or 96.\n\
736 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
737 If there's no unused final char for the specified kind of charset,\n\
738 return nil.")
739 (dimension, chars)
740 Lisp_Object dimension, chars;
742 int final_char;
744 CHECK_NUMBER (dimension, 0);
745 CHECK_NUMBER (chars, 1);
746 if (XINT (dimension) != 1 && XINT (dimension) != 2)
747 error ("Invalid charset dimension %d, it should be 1 or 2",
748 XINT (dimension));
749 if (XINT (chars) != 94 && XINT (chars) != 96)
750 error ("Invalid charset chars %d, it should be 94 or 96",
751 XINT (chars));
752 for (final_char = '0'; final_char <= '?'; final_char++)
754 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
755 break;
757 return (final_char <= '?' ? make_number (final_char) : Qnil);
760 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
761 4, 4, 0,
762 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
763 CHARSET should be defined by `defined-charset' in advance.")
764 (dimension, chars, final_char, charset_symbol)
765 Lisp_Object dimension, chars, final_char, charset_symbol;
767 int charset;
769 CHECK_NUMBER (dimension, 0);
770 CHECK_NUMBER (chars, 1);
771 CHECK_NUMBER (final_char, 2);
772 CHECK_SYMBOL (charset_symbol, 3);
774 if (XINT (dimension) != 1 && XINT (dimension) != 2)
775 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
776 if (XINT (chars) != 94 && XINT (chars) != 96)
777 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
778 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
779 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
780 if ((charset = get_charset_id (charset_symbol)) < 0)
781 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
783 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
784 return Qnil;
787 /* Return information about charsets in the text at PTR of NBYTES
788 bytes, which are NCHARS characters. The value is:
790 0: Each character is represented by one byte. This is always
791 true for unibyte text.
792 1: No charsets other than ascii eight-bit-control,
793 eight-bit-graphic, and latin-1 are found.
794 2: Otherwise.
796 In addition, if CHARSETS is nonzero, for each found charset N, set
797 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
798 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
799 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
800 1 (note that there's no charset whose ID is 1). */
803 find_charset_in_text (ptr, nchars, nbytes, charsets, table)
804 unsigned char *ptr;
805 int nchars, nbytes, *charsets;
806 Lisp_Object table;
808 if (nchars == nbytes)
810 if (charsets && nbytes > 0)
812 unsigned char *endp = ptr + nbytes;
813 int maskbits = 0;
815 while (ptr < endp && maskbits != 7)
817 maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
818 ptr++;
821 if (maskbits & 1)
822 charsets[CHARSET_ASCII] = 1;
823 if (maskbits & 2)
824 charsets[CHARSET_8_BIT_CONTROL] = 1;
825 if (maskbits & 4)
826 charsets[CHARSET_8_BIT_GRAPHIC] = 1;
828 return 0;
830 else
832 int return_val = 1;
833 int bytes, charset, c1, c2;
835 if (! CHAR_TABLE_P (table))
836 table = Qnil;
838 while (nchars-- > 0)
840 SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
841 ptr += bytes;
843 if (!CHARSET_DEFINED_P (charset))
844 charset = 1;
845 else if (! NILP (table))
847 int c = translate_char (table, -1, charset, c1, c2);
848 if (c >= 0)
849 charset = CHAR_CHARSET (c);
852 if (return_val == 1
853 && charset != CHARSET_ASCII
854 && charset != CHARSET_8_BIT_CONTROL
855 && charset != CHARSET_8_BIT_GRAPHIC
856 && charset != charset_latin_iso8859_1)
857 return_val = 2;
859 if (charsets)
860 charsets[charset] = 1;
861 else if (return_val == 2)
862 break;
864 return return_val;
868 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
869 2, 3, 0,
870 "Return a list of charsets in the region between BEG and END.\n\
871 BEG and END are buffer positions.\n\
872 Optional arg TABLE if non-nil is a translation table to look up.\n\
874 If the region contains invalid multibyte characters,\n\
875 `unknown' is included in the returned list.\n\
877 If the current buffer is unibyte, the returned list may contain\n\
878 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
879 (beg, end, table)
880 Lisp_Object beg, end, table;
882 int charsets[MAX_CHARSET + 1];
883 int from, from_byte, to, stop, stop_byte, i;
884 Lisp_Object val;
886 validate_region (&beg, &end);
887 from = XFASTINT (beg);
888 stop = to = XFASTINT (end);
890 if (from < GPT && GPT < to)
892 stop = GPT;
893 stop_byte = GPT_BYTE;
895 else
896 stop_byte = CHAR_TO_BYTE (stop);
898 from_byte = CHAR_TO_BYTE (from);
900 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
901 while (1)
903 find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from,
904 stop_byte - from_byte, charsets, table);
905 if (stop < to)
907 from = stop, from_byte = stop_byte;
908 stop = to, stop_byte = CHAR_TO_BYTE (stop);
910 else
911 break;
914 val = Qnil;
915 if (charsets[1])
916 val = Fcons (Qunknown, val);
917 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
918 if (charsets[i])
919 val = Fcons (CHARSET_SYMBOL (i), val);
920 if (charsets[0])
921 val = Fcons (Qascii, val);
922 return val;
925 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
926 1, 2, 0,
927 "Return a list of charsets in STR.\n\
928 Optional arg TABLE if non-nil is a translation table to look up.\n\
930 If the string contains invalid multibyte characters,\n\
931 `unknown' is included in the returned list.\n\
933 If STR is unibyte, the returned list may contain\n\
934 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
935 (str, table)
936 Lisp_Object str, table;
938 int charsets[MAX_CHARSET + 1];
939 int i;
940 Lisp_Object val;
942 CHECK_STRING (str, 0);
944 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
945 find_charset_in_text (XSTRING (str)->data, XSTRING (str)->size,
946 STRING_BYTES (XSTRING (str)), charsets, table);
948 val = Qnil;
949 if (charsets[1])
950 val = Fcons (Qunknown, val);
951 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
952 if (charsets[i])
953 val = Fcons (CHARSET_SYMBOL (i), val);
954 if (charsets[0])
955 val = Fcons (Qascii, val);
956 return val;
960 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
962 (charset, code1, code2)
963 Lisp_Object charset, code1, code2;
965 int charset_id, c1, c2;
967 CHECK_NUMBER (charset, 0);
968 charset_id = XINT (charset);
969 if (!CHARSET_DEFINED_P (charset_id))
970 error ("Invalid charset ID: %d", XINT (charset));
972 if (NILP (code1))
973 c1 = 0;
974 else
976 CHECK_NUMBER (code1, 1);
977 c1 = XINT (code1);
979 if (NILP (code2))
980 c2 = 0;
981 else
983 CHECK_NUMBER (code2, 2);
984 c2 = XINT (code2);
987 if (charset_id == CHARSET_ASCII)
989 if (c1 < 0 || c1 > 0x7F)
990 goto invalid_code_posints;
991 return make_number (c1);
993 else if (charset_id == CHARSET_8_BIT_CONTROL)
995 if (NILP (code1))
996 c1 = 0x80;
997 else if (c1 < 0x80 || c1 > 0x9F)
998 goto invalid_code_posints;
999 return make_number (c1);
1001 else if (charset_id == CHARSET_8_BIT_GRAPHIC)
1003 if (NILP (code1))
1004 c1 = 0xA0;
1005 else if (c1 < 0xA0 || c1 > 0xFF)
1006 goto invalid_code_posints;
1007 return make_number (c1);
1009 else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
1010 goto invalid_code_posints;
1011 c1 &= 0x7F;
1012 c2 &= 0x7F;
1013 if (c1 == 0
1014 ? c2 != 0
1015 : (c2 == 0
1016 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
1017 : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
1018 goto invalid_code_posints;
1019 return make_number (MAKE_CHAR (charset_id, c1, c2));
1021 invalid_code_posints:
1022 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
1025 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1026 "Return list of charset and one or two position-codes of CHAR.\n\
1027 If CHAR is invalid as a character code,\n\
1028 return a list of symbol `unknown' and CHAR.")
1029 (ch)
1030 Lisp_Object ch;
1032 int c, charset, c1, c2;
1034 CHECK_NUMBER (ch, 0);
1035 c = XFASTINT (ch);
1036 if (!CHAR_VALID_P (c, 1))
1037 return Fcons (Qunknown, Fcons (ch, Qnil));
1038 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1039 return (c2 >= 0
1040 ? Fcons (CHARSET_SYMBOL (charset),
1041 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1042 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1045 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1046 "Return charset of CHAR.")
1047 (ch)
1048 Lisp_Object ch;
1050 CHECK_NUMBER (ch, 0);
1052 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1055 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1056 "Return charset of a character in the current buffer at position POS.\n\
1057 If POS is nil, it defauls to the current point.\n\
1058 If POS is out of range, the value is nil.")
1059 (pos)
1060 Lisp_Object pos;
1062 Lisp_Object ch;
1063 int charset;
1065 ch = Fchar_after (pos);
1066 if (! INTEGERP (ch))
1067 return ch;
1068 charset = CHAR_CHARSET (XINT (ch));
1069 return CHARSET_SYMBOL (charset);
1072 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1073 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1075 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1076 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1077 where as Emacs distinguishes them by charset symbol.\n\
1078 See the documentation of the function `charset-info' for the meanings of\n\
1079 DIMENSION, CHARS, and FINAL-CHAR.")
1080 (dimension, chars, final_char)
1081 Lisp_Object dimension, chars, final_char;
1083 int charset;
1085 CHECK_NUMBER (dimension, 0);
1086 CHECK_NUMBER (chars, 1);
1087 CHECK_NUMBER (final_char, 2);
1089 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1090 return Qnil;
1091 return CHARSET_SYMBOL (charset);
1094 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1095 generic character. If GENERICP is zero, return nonzero iff C is a
1096 valid normal character. Do not call this function directly,
1097 instead use macro CHAR_VALID_P. */
1099 char_valid_p (c, genericp)
1100 int c, genericp;
1102 int charset, c1, c2;
1104 if (c < 0 || c >= MAX_CHAR)
1105 return 0;
1106 if (SINGLE_BYTE_CHAR_P (c))
1107 return 1;
1108 SPLIT_CHAR (c, charset, c1, c2);
1109 if (genericp)
1111 if (c1)
1113 if (c2 <= 0) c2 = 0x20;
1115 else
1117 if (c2 <= 0) c1 = c2 = 0x20;
1120 return (CHARSET_DEFINED_P (charset)
1121 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1124 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1125 "Return t if OBJECT is a valid normal character.\n\
1126 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1127 a valid generic character.")
1128 (object, genericp)
1129 Lisp_Object object, genericp;
1131 if (! NATNUMP (object))
1132 return Qnil;
1133 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1136 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1137 Sunibyte_char_to_multibyte, 1, 1, 0,
1138 "Convert the unibyte character CH to multibyte character.\n\
1139 The conversion is done based on `nonascii-translation-table' (which see)\n\
1140 or `nonascii-insert-offset' (which see).")
1141 (ch)
1142 Lisp_Object ch;
1144 int c;
1146 CHECK_NUMBER (ch, 0);
1147 c = XINT (ch);
1148 if (c < 0 || c >= 0400)
1149 error ("Invalid unibyte character: %d", c);
1150 c = unibyte_char_to_multibyte (c);
1151 if (c < 0)
1152 error ("Can't convert to multibyte character: %d", XINT (ch));
1153 return make_number (c);
1156 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1157 Smultibyte_char_to_unibyte, 1, 1, 0,
1158 "Convert the multibyte character CH to unibyte character.\n\
1159 The conversion is done based on `nonascii-translation-table' (which see)\n\
1160 or `nonascii-insert-offset' (which see).")
1161 (ch)
1162 Lisp_Object ch;
1164 int c;
1166 CHECK_NUMBER (ch, 0);
1167 c = XINT (ch);
1168 if (! CHAR_VALID_P (c, 0))
1169 error ("Invalid multibyte character: %d", c);
1170 c = multibyte_char_to_unibyte (c, Qnil);
1171 if (c < 0)
1172 error ("Can't convert to unibyte character: %d", XINT (ch));
1173 return make_number (c);
1176 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1177 "Return 1 regardless of the argument CHAR.\n\
1178 This is now an obsolete function. We keep it just for backward compatibility.")
1179 (ch)
1180 Lisp_Object ch;
1182 CHECK_NUMBER (ch, 0);
1183 return make_number (1);
1186 /* Return how many bytes C will occupy in a multibyte buffer.
1187 Don't call this function directly, instead use macro CHAR_BYTES. */
1189 char_bytes (c)
1190 int c;
1192 int charset;
1194 if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1)))
1195 return 1;
1196 if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1197 return 1;
1199 charset = CHAR_CHARSET (c);
1200 return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1203 /* Return the width of character of which multi-byte form starts with
1204 C. The width is measured by how many columns occupied on the
1205 screen when displayed in the current buffer. */
1207 #define ONE_BYTE_CHAR_WIDTH(c) \
1208 (c < 0x20 \
1209 ? (c == '\t' \
1210 ? XFASTINT (current_buffer->tab_width) \
1211 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1212 : (c < 0x7f \
1213 ? 1 \
1214 : (c == 0x7F \
1215 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1216 : ((! NILP (current_buffer->enable_multibyte_characters) \
1217 && BASE_LEADING_CODE_P (c)) \
1218 ? WIDTH_BY_CHAR_HEAD (c) \
1219 : 4))))
1221 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1222 "Return width of CHAR when displayed in the current buffer.\n\
1223 The width is measured by how many columns it occupies on the screen.\n\
1224 Tab is taken to occupy `tab-width' columns.")
1225 (ch)
1226 Lisp_Object ch;
1228 Lisp_Object val, disp;
1229 int c;
1230 struct Lisp_Char_Table *dp = buffer_display_table ();
1232 CHECK_NUMBER (ch, 0);
1234 c = XINT (ch);
1236 /* Get the way the display table would display it. */
1237 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1239 if (VECTORP (disp))
1240 XSETINT (val, XVECTOR (disp)->size);
1241 else if (SINGLE_BYTE_CHAR_P (c))
1242 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1243 else
1245 int charset = CHAR_CHARSET (c);
1247 XSETFASTINT (val, CHARSET_WIDTH (charset));
1249 return val;
1252 /* Return width of string STR of length LEN when displayed in the
1253 current buffer. The width is measured by how many columns it
1254 occupies on the screen. */
1257 strwidth (str, len)
1258 unsigned char *str;
1259 int len;
1261 return c_string_width (str, len, -1, NULL, NULL);
1264 /* Return width of string STR of length LEN when displayed in the
1265 current buffer. The width is measured by how many columns it
1266 occupies on the screen. If PRECISION > 0, return the width of
1267 longest substring that doesn't exceed PRECISION, and set number of
1268 characters and bytes of the substring in *NCHARS and *NBYTES
1269 respectively. */
1272 c_string_width (str, len, precision, nchars, nbytes)
1273 unsigned char *str;
1274 int precision, *nchars, *nbytes;
1276 int i = 0, i_byte = 0;
1277 int width = 0;
1278 int chars;
1279 struct Lisp_Char_Table *dp = buffer_display_table ();
1281 while (i_byte < len)
1283 int bytes, thiswidth;
1284 Lisp_Object val;
1286 if (dp)
1288 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
1290 chars = 1;
1291 val = DISP_CHAR_VECTOR (dp, c);
1292 if (VECTORP (val))
1293 thiswidth = XVECTOR (val)->size;
1294 else
1295 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1297 else
1299 chars = 1;
1300 PARSE_MULTIBYTE_SEQ (str + i_byte, len - i_byte, bytes);
1301 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1304 if (precision > 0
1305 && (width + thiswidth > precision))
1307 *nchars = i;
1308 *nbytes = i_byte;
1309 return width;
1311 i++;
1312 i_byte += bytes;
1313 width += thiswidth;
1316 if (precision > 0)
1318 *nchars = i;
1319 *nbytes = i_byte;
1322 return width;
1325 /* Return width of Lisp string STRING when displayed in the current
1326 buffer. The width is measured by how many columns it occupies on
1327 the screen while paying attention to compositions. If PRECISION >
1328 0, return the width of longest substring that doesn't exceed
1329 PRECISION, and set number of characters and bytes of the substring
1330 in *NCHARS and *NBYTES respectively. */
1333 lisp_string_width (string, precision, nchars, nbytes)
1334 Lisp_Object string;
1335 int precision, *nchars, *nbytes;
1337 int len = XSTRING (string)->size;
1338 int len_byte = STRING_BYTES (XSTRING (string));
1339 unsigned char *str = XSTRING (string)->data;
1340 int i = 0, i_byte = 0;
1341 int width = 0;
1342 struct Lisp_Char_Table *dp = buffer_display_table ();
1344 while (i < len)
1346 int chars, bytes, thiswidth;
1347 Lisp_Object val;
1348 int cmp_id;
1349 int ignore, end;
1351 if (find_composition (i, -1, &ignore, &end, &val, string)
1352 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
1353 >= 0))
1355 thiswidth = composition_table[cmp_id]->width;
1356 chars = end - i;
1357 bytes = string_char_to_byte (string, end) - i_byte;
1359 else if (dp)
1361 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
1363 chars = 1;
1364 val = DISP_CHAR_VECTOR (dp, c);
1365 if (VECTORP (val))
1366 thiswidth = XVECTOR (val)->size;
1367 else
1368 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1370 else
1372 chars = 1;
1373 PARSE_MULTIBYTE_SEQ (str + i_byte, len_byte - i_byte, bytes);
1374 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1377 if (precision > 0
1378 && (width + thiswidth > precision))
1380 *nchars = i;
1381 *nbytes = i_byte;
1382 return width;
1384 i += chars;
1385 i_byte += bytes;
1386 width += thiswidth;
1389 if (precision > 0)
1391 *nchars = i;
1392 *nbytes = i_byte;
1395 return width;
1398 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1399 "Return width of STRING when displayed in the current buffer.\n\
1400 Width is measured by how many columns it occupies on the screen.\n\
1401 When calculating width of a multibyte character in STRING,\n\
1402 only the base leading-code is considered; the validity of\n\
1403 the following bytes is not checked. Tabs in STRING are always\n\
1404 taken to occupy `tab-width' columns.")
1405 (str)
1406 Lisp_Object str;
1408 Lisp_Object val;
1410 CHECK_STRING (str, 0);
1411 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
1412 return val;
1415 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1416 "Return the direction of CHAR.\n\
1417 The returned value is 0 for left-to-right and 1 for right-to-left.")
1418 (ch)
1419 Lisp_Object ch;
1421 int charset;
1423 CHECK_NUMBER (ch, 0);
1424 charset = CHAR_CHARSET (XFASTINT (ch));
1425 if (!CHARSET_DEFINED_P (charset))
1426 invalid_character (XINT (ch));
1427 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1430 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1431 "Return number of characters between BEG and END.")
1432 (beg, end)
1433 Lisp_Object beg, end;
1435 int from, to;
1437 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1438 CHECK_NUMBER_COERCE_MARKER (end, 1);
1440 from = min (XFASTINT (beg), XFASTINT (end));
1441 to = max (XFASTINT (beg), XFASTINT (end));
1443 return make_number (to - from);
1446 /* Return the number of characters in the NBYTES bytes at PTR.
1447 This works by looking at the contents and checking for multibyte sequences.
1448 However, if the current buffer has enable-multibyte-characters = nil,
1449 we treat each byte as a character. */
1452 chars_in_text (ptr, nbytes)
1453 unsigned char *ptr;
1454 int nbytes;
1456 /* current_buffer is null at early stages of Emacs initialization. */
1457 if (current_buffer == 0
1458 || NILP (current_buffer->enable_multibyte_characters))
1459 return nbytes;
1461 return multibyte_chars_in_text (ptr, nbytes);
1464 /* Return the number of characters in the NBYTES bytes at PTR.
1465 This works by looking at the contents and checking for multibyte sequences.
1466 It ignores enable-multibyte-characters. */
1469 multibyte_chars_in_text (ptr, nbytes)
1470 unsigned char *ptr;
1471 int nbytes;
1473 unsigned char *endp;
1474 int chars, bytes;
1476 endp = ptr + nbytes;
1477 chars = 0;
1479 while (ptr < endp)
1481 PARSE_MULTIBYTE_SEQ (ptr, endp - ptr, bytes);
1482 ptr += bytes;
1483 chars++;
1486 return chars;
1489 /* Parse unibyte text at STR of LEN bytes as multibyte text, and
1490 count the numbers of characters and bytes in it. On counting
1491 bytes, pay attention to the fact that 8-bit characters in the range
1492 0x80..0x9F are represented by 2 bytes in multibyte text. */
1493 void
1494 parse_str_as_multibyte (str, len, nchars, nbytes)
1495 unsigned char *str;
1496 int len, *nchars, *nbytes;
1498 unsigned char *endp = str + len;
1499 int n, chars = 0, bytes = 0;
1501 while (str < endp)
1503 if (UNIBYTE_STR_AS_MULTIBYTE_P (str, endp - str, n))
1504 str += n, bytes += n;
1505 else
1506 str++, bytes += 2;
1507 chars++;
1509 *nchars = chars;
1510 *nbytes = bytes;
1511 return;
1514 /* Arrange unibyte text at STR of NBYTES bytes as multibyte text.
1515 It actually converts only 8-bit characters in the range 0x80..0x9F
1516 that don't contruct multibyte characters to multibyte forms. If
1517 NCHARS is nonzero, set *NCHARS to the number of characters in the
1518 text. It is assured that we can use LEN bytes at STR as a work
1519 area and that is enough. Return the number of bytes of the
1520 resulting text. */
1523 str_as_multibyte (str, len, nbytes, nchars)
1524 unsigned char *str;
1525 int len, nbytes, *nchars;
1527 unsigned char *p = str, *endp = str + nbytes;
1528 unsigned char *to;
1529 int chars = 0;
1530 int n;
1532 while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1533 p += n, chars++;
1534 if (nchars)
1535 *nchars = chars;
1536 if (p == endp)
1537 return nbytes;
1539 to = p;
1540 nbytes = endp - p;
1541 endp = str + len;
1542 safe_bcopy (p, endp - nbytes, nbytes);
1543 p = endp - nbytes;
1544 while (p < endp)
1546 if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1548 while (n--)
1549 *to++ = *p++;
1551 else
1553 *to++ = LEADING_CODE_8_BIT_CONTROL;
1554 *to++ = *p++ + 0x20;
1556 chars++;
1558 if (nchars)
1559 *nchars = chars;
1560 return (to - str);
1563 /* Parse unibyte string at STR of LEN bytes, and return the number of
1564 bytes it may ocupy when converted to multibyte string by
1565 `str_to_multibyte'. */
1568 parse_str_to_multibyte (str, len)
1569 unsigned char *str;
1570 int len;
1572 unsigned char *endp = str + len;
1573 int bytes;
1575 for (bytes = 0; str < endp; str++)
1576 bytes += (*str < 0x80 || *str >= 0xA0) ? 1 : 2;
1577 return bytes;
1580 /* Convert unibyte text at STR of NBYTES bytes to multibyte text
1581 that contains the same single-byte characters. It actually
1582 converts all 8-bit characters to multibyte forms. It is assured
1583 that we can use LEN bytes at STR as a work area and that is
1584 enough. */
1587 str_to_multibyte (str, len, bytes)
1588 unsigned char *str;
1589 int len, bytes;
1591 unsigned char *p = str, *endp = str + bytes;
1592 unsigned char *to;
1594 while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++;
1595 if (p == endp)
1596 return bytes;
1597 to = p;
1598 bytes = endp - p;
1599 endp = str + len;
1600 safe_bcopy (p, endp - bytes, bytes);
1601 p = endp - bytes;
1602 while (p < endp)
1604 if (*p < 0x80 || *p >= 0xA0)
1605 *to++ = *p++;
1606 else
1607 *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1609 return (to - str);
1612 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1613 actually converts only 8-bit characters in the range 0x80..0x9F to
1614 unibyte forms. */
1617 str_as_unibyte (str, bytes)
1618 unsigned char *str;
1619 int bytes;
1621 unsigned char *p = str, *endp = str + bytes;
1622 unsigned char *to = str;
1624 while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++;
1625 to = p;
1626 while (p < endp)
1628 if (*p == LEADING_CODE_8_BIT_CONTROL)
1629 *to++ = *(p + 1) - 0x20, p += 2;
1630 else
1631 *to++ = *p++;
1633 return (to - str);
1637 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1638 "Concatenate all the argument characters and make the result a string.")
1639 (n, args)
1640 int n;
1641 Lisp_Object *args;
1643 int i;
1644 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1645 unsigned char *p = buf;
1646 int c;
1647 int multibyte = 0;
1649 for (i = 0; i < n; i++)
1651 CHECK_NUMBER (args[i], 0);
1652 if (!multibyte && !SINGLE_BYTE_CHAR_P (XFASTINT (args[i])))
1653 multibyte = 1;
1656 for (i = 0; i < n; i++)
1658 c = XINT (args[i]);
1659 if (multibyte)
1660 p += CHAR_STRING (c, p);
1661 else
1662 *p++ = c;
1665 return make_string_from_bytes (buf, n, p - buf);
1668 #endif /* emacs */
1671 charset_id_internal (charset_name)
1672 char *charset_name;
1674 Lisp_Object val;
1676 val= Fget (intern (charset_name), Qcharset);
1677 if (!VECTORP (val))
1678 error ("Charset %s is not defined", charset_name);
1680 return (XINT (XVECTOR (val)->contents[0]));
1683 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1684 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1687 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1688 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1689 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1690 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1691 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1692 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1693 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1694 return Qnil;
1697 void
1698 init_charset_once ()
1700 int i, j, k;
1702 staticpro (&Vcharset_table);
1703 staticpro (&Vcharset_symbol_table);
1704 staticpro (&Vgeneric_character_list);
1706 /* This has to be done here, before we call Fmake_char_table. */
1707 Qcharset_table = intern ("charset-table");
1708 staticpro (&Qcharset_table);
1710 /* Intern this now in case it isn't already done.
1711 Setting this variable twice is harmless.
1712 But don't staticpro it here--that is done in alloc.c. */
1713 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1715 /* Now we are ready to set up this property, so we can
1716 create the charset table. */
1717 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1718 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1720 Qunknown = intern ("unknown");
1721 staticpro (&Qunknown);
1722 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1723 Qunknown);
1725 /* Setup tables. */
1726 for (i = 0; i < 2; i++)
1727 for (j = 0; j < 2; j++)
1728 for (k = 0; k < 128; k++)
1729 iso_charset_table [i][j][k] = -1;
1731 for (i = 0; i < 256; i++)
1732 bytes_by_char_head[i] = 1;
1733 bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1734 bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1735 bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1736 bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1738 for (i = 0; i < 128; i++)
1739 width_by_char_head[i] = 1;
1740 for (; i < 256; i++)
1741 width_by_char_head[i] = 4;
1742 width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1743 width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1744 width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1745 width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1748 Lisp_Object val;
1750 val = Qnil;
1751 for (i = 0x81; i < 0x90; i++)
1752 val = Fcons (make_number ((i - 0x70) << 7), val);
1753 for (; i < 0x9A; i++)
1754 val = Fcons (make_number ((i - 0x8F) << 14), val);
1755 for (i = 0xA0; i < 0xF0; i++)
1756 val = Fcons (make_number ((i - 0x70) << 7), val);
1757 for (; i < 0xFF; i++)
1758 val = Fcons (make_number ((i - 0xE0) << 14), val);
1759 Vgeneric_character_list = Fnreverse (val);
1762 nonascii_insert_offset = 0;
1763 Vnonascii_translation_table = Qnil;
1766 #ifdef emacs
1768 void
1769 syms_of_charset ()
1771 Qcharset = intern ("charset");
1772 staticpro (&Qcharset);
1774 Qascii = intern ("ascii");
1775 staticpro (&Qascii);
1777 Qeight_bit_control = intern ("eight-bit-control");
1778 staticpro (&Qeight_bit_control);
1780 Qeight_bit_graphic = intern ("eight-bit-graphic");
1781 staticpro (&Qeight_bit_graphic);
1783 /* Define special charsets ascii, eight-bit-control, and
1784 eight-bit-graphic. */
1785 update_charset_table (make_number (CHARSET_ASCII),
1786 make_number (1), make_number (94),
1787 make_number (1),
1788 make_number (0),
1789 make_number ('B'),
1790 make_number (0),
1791 build_string ("ASCII"),
1792 Qnil, /* same as above */
1793 build_string ("ASCII (ISO646 IRV)"));
1794 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1795 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1797 update_charset_table (make_number (CHARSET_8_BIT_CONTROL),
1798 make_number (1), make_number (96),
1799 make_number (4),
1800 make_number (0),
1801 make_number (-1),
1802 make_number (-1),
1803 build_string ("8-bit control code (0x80..0x9F)"),
1804 Qnil, /* same as above */
1805 Qnil); /* same as above */
1806 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control;
1807 Fput (Qeight_bit_control, Qcharset,
1808 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL));
1810 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC),
1811 make_number (1), make_number (96),
1812 make_number (4),
1813 make_number (0),
1814 make_number (-1),
1815 make_number (-1),
1816 build_string ("8-bit graphic char (0xA0..0xFF)"),
1817 Qnil, /* same as above */
1818 Qnil); /* same as above */
1819 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1820 Fput (Qeight_bit_graphic, Qcharset,
1821 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1823 Qauto_fill_chars = intern ("auto-fill-chars");
1824 staticpro (&Qauto_fill_chars);
1825 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1827 defsubr (&Sdefine_charset);
1828 defsubr (&Sgeneric_character_list);
1829 defsubr (&Sget_unused_iso_final_char);
1830 defsubr (&Sdeclare_equiv_charset);
1831 defsubr (&Sfind_charset_region);
1832 defsubr (&Sfind_charset_string);
1833 defsubr (&Smake_char_internal);
1834 defsubr (&Ssplit_char);
1835 defsubr (&Schar_charset);
1836 defsubr (&Scharset_after);
1837 defsubr (&Siso_charset);
1838 defsubr (&Schar_valid_p);
1839 defsubr (&Sunibyte_char_to_multibyte);
1840 defsubr (&Smultibyte_char_to_unibyte);
1841 defsubr (&Schar_bytes);
1842 defsubr (&Schar_width);
1843 defsubr (&Sstring_width);
1844 defsubr (&Schar_direction);
1845 defsubr (&Schars_in_region);
1846 defsubr (&Sstring);
1847 defsubr (&Ssetup_special_charsets);
1849 DEFVAR_LISP ("charset-list", &Vcharset_list,
1850 "List of charsets ever defined.");
1851 Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control,
1852 Fcons (Qeight_bit_graphic, Qnil)));
1854 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1855 "Vector of cons cell of a symbol and translation table ever defined.\n\
1856 An ID of a translation table is an index of this vector.");
1857 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1859 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1860 "Leading-code of private TYPE9N charset of column-width 1.");
1861 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1863 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1864 "Leading-code of private TYPE9N charset of column-width 2.");
1865 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1867 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1868 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1869 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1871 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1872 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1873 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1875 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1876 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1877 This is used for converting unibyte text to multibyte,\n\
1878 and for inserting character codes specified by number.\n\n\
1879 This serves to convert a Latin-1 or similar 8-bit character code\n\
1880 to the corresponding Emacs multibyte character code.\n\
1881 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1882 for your choice of character set.\n\
1883 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1884 nonascii_insert_offset = 0;
1886 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1887 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1888 This is used for converting unibyte text to multibyte,\n\
1889 and for inserting character codes specified by number.\n\n\
1890 Conversion is performed only when multibyte characters are enabled,\n\
1891 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1892 to the corresponding Emacs character code.\n\n\
1893 If this is nil, `nonascii-insert-offset' is used instead.\n\
1894 See also the docstring of `make-translation-table'.");
1895 Vnonascii_translation_table = Qnil;
1897 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1898 "A char-table for characters which invoke auto-filling.\n\
1899 Such characters have value t in this table.");
1900 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1901 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1902 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1905 #endif /* emacs */