Clarify what's the problem with KDE's klipper applet.
[emacs.git] / src / charset.c
blobfa355fc21bdda5fddba67d7daa801461ef021305
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* At first, see the document in `charset.h' to understand the code in
23 this file. */
25 #ifdef emacs
26 #include <config.h>
27 #endif
29 #include <stdio.h>
31 #ifdef emacs
33 #include <sys/types.h>
34 #include "lisp.h"
35 #include "buffer.h"
36 #include "charset.h"
37 #include "composite.h"
38 #include "coding.h"
39 #include "disptab.h"
41 #else /* not emacs */
43 #include "mulelib.h"
45 #endif /* emacs */
47 Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
48 Lisp_Object Qunknown;
50 /* Declaration of special leading-codes. */
51 int leading_code_private_11; /* for private DIMENSION1 of 1-column */
52 int leading_code_private_12; /* for private DIMENSION1 of 2-column */
53 int leading_code_private_21; /* for private DIMENSION2 of 1-column */
54 int leading_code_private_22; /* for private DIMENSION2 of 2-column */
56 /* Declaration of special charsets. The values are set by
57 Fsetup_special_charsets. */
58 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
59 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
60 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
61 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
62 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
63 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
64 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
66 Lisp_Object Qcharset_table;
68 /* A char-table containing information of each character set. */
69 Lisp_Object Vcharset_table;
71 /* A vector of charset symbol indexed by charset-id. This is used
72 only for returning charset symbol from C functions. */
73 Lisp_Object Vcharset_symbol_table;
75 /* A list of charset symbols ever defined. */
76 Lisp_Object Vcharset_list;
78 /* Vector of translation table ever defined.
79 ID of a translation table is used to index this vector. */
80 Lisp_Object Vtranslation_table_vector;
82 /* A char-table for characters which may invoke auto-filling. */
83 Lisp_Object Vauto_fill_chars;
85 Lisp_Object Qauto_fill_chars;
87 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
88 int bytes_by_char_head[256];
89 int width_by_char_head[256];
91 /* Mapping table from ISO2022's charset (specified by DIMENSION,
92 CHARS, and FINAL-CHAR) to Emacs' charset. */
93 int iso_charset_table[2][2][128];
95 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
96 unsigned char *_fetch_multibyte_char_p;
97 int _fetch_multibyte_char_len;
99 /* Offset to add to a non-ASCII value when inserting it. */
100 int nonascii_insert_offset;
102 /* Translation table for converting non-ASCII unibyte characters
103 to multibyte codes, or nil. */
104 Lisp_Object Vnonascii_translation_table;
106 /* List of all possible generic characters. */
107 Lisp_Object Vgeneric_character_list;
109 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
110 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
112 void
113 invalid_character (c)
114 int c;
116 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
119 /* Parse string STR of length LENGTH and fetch information of a
120 character at STR. Set BYTES to the byte length the character
121 occupies, CHARSET, C1, C2 to proper values of the character. */
123 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
124 do { \
125 (c1) = *(str); \
126 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
127 if ((bytes) == 1) \
128 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
129 else if ((bytes) == 2) \
131 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
132 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
133 else \
134 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
136 else if ((bytes) == 3) \
138 if ((c1) < LEADING_CODE_PRIVATE_11) \
139 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
140 else \
141 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
143 else \
144 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
145 } while (0)
147 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
148 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
149 ((charset) == CHARSET_ASCII \
150 ? ((c1) >= 0 && (c1) <= 0x7F) \
151 : ((charset) == CHARSET_8_BIT_CONTROL \
152 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
153 : ((charset) == CHARSET_8_BIT_GRAPHIC \
154 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
155 : (CHARSET_DIMENSION (charset) == 1 \
156 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
157 : ((c1) >= 0x20 && (c1) <= 0x7F \
158 && (c2) >= 0x20 && (c2) <= 0x7F)))))
160 /* Store multi-byte form of the character C in STR. The caller should
161 allocate at least 4-byte area at STR in advance. Returns the
162 length of the multi-byte form. If C is an invalid character code,
163 signal an error.
165 Use macro `CHAR_STRING (C, STR)' instead of calling this function
166 directly if C can be an ASCII character. */
169 char_to_string (c, str)
170 int c;
171 unsigned char *str;
173 unsigned char *p = str;
175 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
177 /* Multibyte character can't have a modifier bit. */
178 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
179 invalid_character (c);
181 /* For Meta, Shift, and Control modifiers, we need special care. */
182 if (c & CHAR_META)
184 /* Move the meta bit to the right place for a string. */
185 c = (c & ~CHAR_META) | 0x80;
187 if (c & CHAR_SHIFT)
189 /* Shift modifier is valid only with [A-Za-z]. */
190 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
191 c &= ~CHAR_SHIFT;
192 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
193 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
195 if (c & CHAR_CTL)
197 /* Simulate the code in lread.c. */
198 /* Allow `\C- ' and `\C-?'. */
199 if (c == (CHAR_CTL | ' '))
200 c = 0;
201 else if (c == (CHAR_CTL | '?'))
202 c = 127;
203 /* ASCII control chars are made from letters (both cases),
204 as well as the non-letters within 0100...0137. */
205 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
206 c &= (037 | (~0177 & ~CHAR_CTL));
207 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
208 c &= (037 | (~0177 & ~CHAR_CTL));
211 /* If C still has any modifier bits, just ignore it. */
212 c &= ~CHAR_MODIFIER_MASK;
214 if (SINGLE_BYTE_CHAR_P (c))
216 if (ASCII_BYTE_P (c) || c >= 0xA0)
217 *p++ = c;
218 else
220 *p++ = LEADING_CODE_8_BIT_CONTROL;
221 *p++ = c + 0x20;
224 else if (CHAR_VALID_P (c, 0))
226 int charset, c1, c2;
228 SPLIT_CHAR (c, charset, c1, c2);
230 if (charset >= LEADING_CODE_EXT_11)
231 *p++ = (charset < LEADING_CODE_EXT_12
232 ? LEADING_CODE_PRIVATE_11
233 : (charset < LEADING_CODE_EXT_21
234 ? LEADING_CODE_PRIVATE_12
235 : (charset < LEADING_CODE_EXT_22
236 ? LEADING_CODE_PRIVATE_21
237 : LEADING_CODE_PRIVATE_22)));
238 *p++ = charset;
239 if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
240 invalid_character (c);
241 if (c1)
243 *p++ = c1 | 0x80;
244 if (c2 > 0)
245 *p++ = c2 | 0x80;
248 else
249 invalid_character (c);
251 return (p - str);
254 /* Return the non-ASCII character corresponding to multi-byte form at
255 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
256 length of the multibyte form in *ACTUAL_LEN.
258 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
259 this function directly if you want ot handle ASCII characters as
260 well. */
263 string_to_char (str, len, actual_len)
264 const unsigned char *str;
265 int len, *actual_len;
267 int c, bytes, charset, c1, c2;
269 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
270 c = MAKE_CHAR (charset, c1, c2);
271 if (actual_len)
272 *actual_len = bytes;
273 return c;
276 /* Return the length of the multi-byte form at string STR of length LEN.
277 Use the macro MULTIBYTE_FORM_LENGTH instead. */
279 multibyte_form_length (str, len)
280 const unsigned char *str;
281 int len;
283 int bytes;
285 PARSE_MULTIBYTE_SEQ (str, len, bytes);
286 return bytes;
289 /* Check multibyte form at string STR of length LEN and set variables
290 pointed by CHARSET, C1, and C2 to charset and position codes of the
291 character at STR, and return 0. If there's no multibyte character,
292 return -1. This should be used only in the macro SPLIT_STRING
293 which checks range of STR in advance. */
296 split_string (str, len, charset, c1, c2)
297 const unsigned char *str;
298 unsigned char *c1, *c2;
299 int len, *charset;
301 register int bytes, cs, code1, code2 = -1;
303 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
304 if (cs == CHARSET_ASCII)
305 return -1;
306 *charset = cs;
307 *c1 = code1;
308 *c2 = code2;
309 return 0;
312 /* Return 1 iff character C has valid printable glyph.
313 Use the macro CHAR_PRINTABLE_P instead. */
315 char_printable_p (c)
316 int c;
318 int charset, c1, c2;
320 if (ASCII_BYTE_P (c))
321 return 1;
322 else if (SINGLE_BYTE_CHAR_P (c))
323 return 0;
324 else if (c >= MAX_CHAR)
325 return 0;
327 SPLIT_CHAR (c, charset, c1, c2);
328 if (! CHARSET_DEFINED_P (charset))
329 return 0;
330 if (CHARSET_CHARS (charset) == 94
331 ? c1 <= 32 || c1 >= 127
332 : c1 < 32)
333 return 0;
334 if (CHARSET_DIMENSION (charset) == 2
335 && (CHARSET_CHARS (charset) == 94
336 ? c2 <= 32 || c2 >= 127
337 : c2 < 32))
338 return 0;
339 return 1;
342 /* Translate character C by translation table TABLE. If C
343 is negative, translate a character specified by CHARSET, C1, and C2
344 (C1 and C2 are code points of the character). If no translation is
345 found in TABLE, return C. */
347 translate_char (table, c, charset, c1, c2)
348 Lisp_Object table;
349 int c, charset, c1, c2;
351 Lisp_Object ch;
352 int alt_charset, alt_c1, alt_c2, dimension;
354 if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
355 if (!CHAR_TABLE_P (table)
356 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
357 return c;
359 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
360 dimension = CHARSET_DIMENSION (alt_charset);
361 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
362 /* CH is not a generic character, just return it. */
363 return XFASTINT (ch);
365 /* Since CH is a generic character, we must return a specific
366 charater which has the same position codes as C from CH. */
367 if (charset < 0)
368 SPLIT_CHAR (c, charset, c1, c2);
369 if (dimension != CHARSET_DIMENSION (charset))
370 /* We can't make such a character because of dimension mismatch. */
371 return c;
372 return MAKE_CHAR (alt_charset, c1, c2);
375 /* Convert the unibyte character C to multibyte based on
376 Vnonascii_translation_table or nonascii_insert_offset. If they can't
377 convert C to a valid multibyte character, convert it based on
378 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
381 unibyte_char_to_multibyte (c)
382 int c;
384 if (c < 0400 && c >= 0200)
386 int c_save = c;
388 if (! NILP (Vnonascii_translation_table))
390 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
391 if (c >= 0400 && ! char_valid_p (c, 0))
392 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
394 else if (c >= 0240 && nonascii_insert_offset > 0)
396 c += nonascii_insert_offset;
397 if (c < 0400 || ! char_valid_p (c, 0))
398 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
400 else if (c >= 0240)
401 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
403 return c;
407 /* Convert the multibyte character C to unibyte 8-bit character based
408 on Vnonascii_translation_table or nonascii_insert_offset. If
409 REV_TBL is non-nil, it should be a reverse table of
410 Vnonascii_translation_table, i.e. what given by:
411 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
414 multibyte_char_to_unibyte (c, rev_tbl)
415 int c;
416 Lisp_Object rev_tbl;
418 if (!SINGLE_BYTE_CHAR_P (c))
420 int c_save = c;
422 if (! CHAR_TABLE_P (rev_tbl)
423 && CHAR_TABLE_P (Vnonascii_translation_table))
424 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
425 make_number (0));
426 if (CHAR_TABLE_P (rev_tbl))
428 Lisp_Object temp;
429 temp = Faref (rev_tbl, make_number (c));
430 if (INTEGERP (temp))
431 c = XINT (temp);
432 if (c >= 256)
433 c = (c_save & 0177) + 0200;
435 else
437 if (nonascii_insert_offset > 0)
438 c -= nonascii_insert_offset;
439 if (c < 128 || c >= 256)
440 c = (c_save & 0177) + 0200;
444 return c;
448 /* Update the table Vcharset_table with the given arguments (see the
449 document of `define-charset' for the meaning of each argument).
450 Several other table contents are also updated. The caller should
451 check the validity of CHARSET-ID and the remaining arguments in
452 advance. */
454 void
455 update_charset_table (charset_id, dimension, chars, width, direction,
456 iso_final_char, iso_graphic_plane,
457 short_name, long_name, description)
458 Lisp_Object charset_id, dimension, chars, width, direction;
459 Lisp_Object iso_final_char, iso_graphic_plane;
460 Lisp_Object short_name, long_name, description;
462 int charset = XINT (charset_id);
463 int bytes;
464 unsigned char leading_code_base, leading_code_ext;
466 if (NILP (CHARSET_TABLE_ENTRY (charset)))
467 CHARSET_TABLE_ENTRY (charset)
468 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
470 if (NILP (long_name))
471 long_name = short_name;
472 if (NILP (description))
473 description = long_name;
475 /* Get byte length of multibyte form, base leading-code, and
476 extended leading-code of the charset. See the comment under the
477 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
478 bytes = XINT (dimension);
479 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
481 /* Official charset, it doesn't have an extended leading-code. */
482 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
483 bytes += 1; /* For a base leading-code. */
484 leading_code_base = charset;
485 leading_code_ext = 0;
487 else
489 /* Private charset. */
490 bytes += 2; /* For base and extended leading-codes. */
491 leading_code_base
492 = (charset < LEADING_CODE_EXT_12
493 ? LEADING_CODE_PRIVATE_11
494 : (charset < LEADING_CODE_EXT_21
495 ? LEADING_CODE_PRIVATE_12
496 : (charset < LEADING_CODE_EXT_22
497 ? LEADING_CODE_PRIVATE_21
498 : LEADING_CODE_PRIVATE_22)));
499 leading_code_ext = charset;
500 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
501 error ("Invalid dimension for the charset-ID %d", charset);
504 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
505 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
506 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
507 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
508 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
509 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
510 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
511 = make_number (leading_code_base);
512 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
513 = make_number (leading_code_ext);
514 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
515 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
516 = iso_graphic_plane;
517 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
518 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
519 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
520 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
523 /* If we have already defined a charset which has the same
524 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
525 DIRECTION, we must update the entry REVERSE-CHARSET of both
526 charsets. If there's no such charset, the value of the entry
527 is set to nil. */
528 int i;
530 for (i = 0; i <= MAX_CHARSET; i++)
531 if (!NILP (CHARSET_TABLE_ENTRY (i)))
533 if (CHARSET_DIMENSION (i) == XINT (dimension)
534 && CHARSET_CHARS (i) == XINT (chars)
535 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
536 && CHARSET_DIRECTION (i) != XINT (direction))
538 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
539 = make_number (i);
540 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
541 break;
544 if (i > MAX_CHARSET)
545 /* No such a charset. */
546 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
547 = make_number (-1);
550 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
551 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
553 bytes_by_char_head[leading_code_base] = bytes;
554 width_by_char_head[leading_code_base] = XINT (width);
556 /* Update table emacs_code_class. */
557 emacs_code_class[charset] = (bytes == 2
558 ? EMACS_leading_code_2
559 : (bytes == 3
560 ? EMACS_leading_code_3
561 : EMACS_leading_code_4));
564 /* Update table iso_charset_table. */
565 if (XINT (iso_final_char) >= 0
566 && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
567 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
570 #ifdef emacs
572 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
573 is invalid. */
575 get_charset_id (charset_symbol)
576 Lisp_Object charset_symbol;
578 Lisp_Object val;
579 int charset;
581 return ((SYMBOLP (charset_symbol)
582 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
583 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
584 CHARSET_VALID_P (charset)))
585 ? charset : -1);
588 /* Return an identification number for a new private charset of
589 DIMENSION and WIDTH. If there's no more room for the new charset,
590 return 0. */
591 Lisp_Object
592 get_new_private_charset_id (dimension, width)
593 int dimension, width;
595 int charset, from, to;
597 if (dimension == 1)
599 from = LEADING_CODE_EXT_11;
600 to = LEADING_CODE_EXT_21;
602 else
604 from = LEADING_CODE_EXT_21;
605 to = LEADING_CODE_EXT_MAX + 1;
608 for (charset = from; charset < to; charset++)
609 if (!CHARSET_DEFINED_P (charset)) break;
611 return make_number (charset < to ? charset : 0);
614 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
615 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
616 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
617 treated as a private charset.\n\
618 INFO-VECTOR is a vector of the format:\n\
619 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
620 SHORT-NAME LONG-NAME DESCRIPTION]\n\
621 The meanings of each elements is as follows:\n\
622 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
623 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
624 WIDTH (integer) is the number of columns a character in the charset\n\
625 occupies on the screen: one of 0, 1, and 2.\n\
627 DIRECTION (integer) is the rendering direction of characters in the\n\
628 charset when rendering. If 0, render from left to right, else\n\
629 render from right to left.\n\
631 ISO-FINAL-CHAR (character) is the final character of the\n\
632 corresponding ISO 2022 charset.\n\
633 It may be -1 if the charset is internal use only.\n\
635 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
636 while encoding to variants of ISO 2022 coding system, one of the\n\
637 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
638 It may be -1 if the charset is internal use only.\n\
640 SHORT-NAME (string) is the short name to refer to the charset.\n\
642 LONG-NAME (string) is the long name to refer to the charset.\n\
644 DESCRIPTION (string) is the description string of the charset.")
645 (charset_id, charset_symbol, info_vector)
646 Lisp_Object charset_id, charset_symbol, info_vector;
648 Lisp_Object *vec;
650 if (!NILP (charset_id))
651 CHECK_NUMBER (charset_id, 0);
652 CHECK_SYMBOL (charset_symbol, 1);
653 CHECK_VECTOR (info_vector, 2);
655 if (! NILP (charset_id))
657 if (! CHARSET_VALID_P (XINT (charset_id)))
658 error ("Invalid CHARSET: %d", XINT (charset_id));
659 else if (CHARSET_DEFINED_P (XINT (charset_id)))
660 error ("Already defined charset: %d", XINT (charset_id));
663 vec = XVECTOR (info_vector)->contents;
664 if (XVECTOR (info_vector)->size != 9
665 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
666 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
667 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
668 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
669 || !INTEGERP (vec[4])
670 || !(XINT (vec[4]) == -1 || XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
671 || !INTEGERP (vec[5])
672 || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
673 || !STRINGP (vec[6])
674 || !STRINGP (vec[7])
675 || !STRINGP (vec[8]))
676 error ("Invalid info-vector argument for defining charset %s",
677 XSYMBOL (charset_symbol)->name->data);
679 if (NILP (charset_id))
681 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
682 if (XINT (charset_id) == 0)
683 error ("There's no room for a new private charset %s",
684 XSYMBOL (charset_symbol)->name->data);
687 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
688 vec[4], vec[5], vec[6], vec[7], vec[8]);
689 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
690 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
691 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
692 return Qnil;
695 DEFUN ("generic-character-list", Fgeneric_character_list,
696 Sgeneric_character_list, 0, 0, 0,
697 "Return a list of all possible generic characters.\n\
698 It includes a generic character for a charset not yet defined.")
701 return Vgeneric_character_list;
704 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
705 Sget_unused_iso_final_char, 2, 2, 0,
706 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
707 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
708 CHARS is the number of characters in a dimension: 94 or 96.\n\
710 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
711 If there's no unused final char for the specified kind of charset,\n\
712 return nil.")
713 (dimension, chars)
714 Lisp_Object dimension, chars;
716 int final_char;
718 CHECK_NUMBER (dimension, 0);
719 CHECK_NUMBER (chars, 1);
720 if (XINT (dimension) != 1 && XINT (dimension) != 2)
721 error ("Invalid charset dimension %d, it should be 1 or 2",
722 XINT (dimension));
723 if (XINT (chars) != 94 && XINT (chars) != 96)
724 error ("Invalid charset chars %d, it should be 94 or 96",
725 XINT (chars));
726 for (final_char = '0'; final_char <= '?'; final_char++)
728 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
729 break;
731 return (final_char <= '?' ? make_number (final_char) : Qnil);
734 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
735 4, 4, 0,
736 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
737 CHARSET should be defined by `defined-charset' in advance.")
738 (dimension, chars, final_char, charset_symbol)
739 Lisp_Object dimension, chars, final_char, charset_symbol;
741 int charset;
743 CHECK_NUMBER (dimension, 0);
744 CHECK_NUMBER (chars, 1);
745 CHECK_NUMBER (final_char, 2);
746 CHECK_SYMBOL (charset_symbol, 3);
748 if (XINT (dimension) != 1 && XINT (dimension) != 2)
749 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
750 if (XINT (chars) != 94 && XINT (chars) != 96)
751 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
752 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
753 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
754 if ((charset = get_charset_id (charset_symbol)) < 0)
755 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
757 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
758 return Qnil;
761 /* Return information about charsets in the text at PTR of NBYTES
762 bytes, which are NCHARS characters. The value is:
764 0: Each character is represented by one byte. This is always
765 true for unibyte text.
766 1: No charsets other than ascii eight-bit-control,
767 eight-bit-graphic, and latin-1 are found.
768 2: Otherwise.
770 In addition, if CHARSETS is nonzero, for each found charset N, set
771 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
772 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
773 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
774 1 (note that there's no charset whose ID is 1). */
777 find_charset_in_text (ptr, nchars, nbytes, charsets, table)
778 unsigned char *ptr;
779 int nchars, nbytes, *charsets;
780 Lisp_Object table;
782 if (nchars == nbytes)
784 if (charsets && nbytes > 0)
786 unsigned char *endp = ptr + nbytes;
787 int maskbits = 0;
789 while (ptr < endp && maskbits != 7)
791 maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
792 ptr++;
795 if (maskbits & 1)
796 charsets[CHARSET_ASCII] = 1;
797 if (maskbits & 2)
798 charsets[CHARSET_8_BIT_CONTROL] = 1;
799 if (maskbits & 4)
800 charsets[CHARSET_8_BIT_GRAPHIC] = 1;
802 return 0;
804 else
806 int return_val = 1;
807 int bytes, charset, c1, c2;
809 if (! CHAR_TABLE_P (table))
810 table = Qnil;
812 while (nchars-- > 0)
814 SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
815 ptr += bytes;
817 if (!CHARSET_DEFINED_P (charset))
818 charset = 1;
819 else if (! NILP (table))
821 int c = translate_char (table, -1, charset, c1, c2);
822 if (c >= 0)
823 charset = CHAR_CHARSET (c);
826 if (return_val == 1
827 && charset != CHARSET_ASCII
828 && charset != CHARSET_8_BIT_CONTROL
829 && charset != CHARSET_8_BIT_GRAPHIC
830 && charset != charset_latin_iso8859_1)
831 return_val = 2;
833 if (charsets)
834 charsets[charset] = 1;
835 else if (return_val == 2)
836 break;
838 return return_val;
842 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
843 2, 3, 0,
844 "Return a list of charsets in the region between BEG and END.\n\
845 BEG and END are buffer positions.\n\
846 Optional arg TABLE if non-nil is a translation table to look up.\n\
848 If the region contains invalid multibyte characters,\n\
849 `unknown' is included in the returned list.\n\
851 If the current buffer is unibyte, the returned list may contain\n\
852 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
853 (beg, end, table)
854 Lisp_Object beg, end, table;
856 int charsets[MAX_CHARSET + 1];
857 int from, from_byte, to, stop, stop_byte, i;
858 Lisp_Object val;
860 validate_region (&beg, &end);
861 from = XFASTINT (beg);
862 stop = to = XFASTINT (end);
864 if (from < GPT && GPT < to)
866 stop = GPT;
867 stop_byte = GPT_BYTE;
869 else
870 stop_byte = CHAR_TO_BYTE (stop);
872 from_byte = CHAR_TO_BYTE (from);
874 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
875 while (1)
877 find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from,
878 stop_byte - from_byte, charsets, table);
879 if (stop < to)
881 from = stop, from_byte = stop_byte;
882 stop = to, stop_byte = CHAR_TO_BYTE (stop);
884 else
885 break;
888 val = Qnil;
889 if (charsets[1])
890 val = Fcons (Qunknown, val);
891 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
892 if (charsets[i])
893 val = Fcons (CHARSET_SYMBOL (i), val);
894 if (charsets[0])
895 val = Fcons (Qascii, val);
896 return val;
899 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
900 1, 2, 0,
901 "Return a list of charsets in STR.\n\
902 Optional arg TABLE if non-nil is a translation table to look up.\n\
904 If the string contains invalid multibyte characters,\n\
905 `unknown' is included in the returned list.\n\
907 If STR is unibyte, the returned list may contain\n\
908 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
909 (str, table)
910 Lisp_Object str, table;
912 int charsets[MAX_CHARSET + 1];
913 int i;
914 Lisp_Object val;
916 CHECK_STRING (str, 0);
918 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
919 find_charset_in_text (XSTRING (str)->data, XSTRING (str)->size,
920 STRING_BYTES (XSTRING (str)), charsets, table);
922 val = Qnil;
923 if (charsets[1])
924 val = Fcons (Qunknown, val);
925 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
926 if (charsets[i])
927 val = Fcons (CHARSET_SYMBOL (i), val);
928 if (charsets[0])
929 val = Fcons (Qascii, val);
930 return val;
934 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
936 (charset, code1, code2)
937 Lisp_Object charset, code1, code2;
939 int charset_id, c1, c2;
941 CHECK_NUMBER (charset, 0);
942 charset_id = XINT (charset);
943 if (!CHARSET_DEFINED_P (charset_id))
944 error ("Invalid charset ID: %d", XINT (charset));
946 if (NILP (code1))
947 c1 = 0;
948 else
950 CHECK_NUMBER (code1, 1);
951 c1 = XINT (code1);
953 if (NILP (code2))
954 c2 = 0;
955 else
957 CHECK_NUMBER (code2, 2);
958 c2 = XINT (code2);
961 if (charset_id == CHARSET_ASCII)
963 if (c1 < 0 || c1 > 0x7F)
964 goto invalid_code_posints;
965 return make_number (c1);
967 else if (charset_id == CHARSET_8_BIT_CONTROL)
969 if (NILP (code1))
970 c1 = 0x80;
971 else if (c1 < 0x80 || c1 > 0x9F)
972 goto invalid_code_posints;
973 return make_number (c1);
975 else if (charset_id == CHARSET_8_BIT_GRAPHIC)
977 if (NILP (code1))
978 c1 = 0xA0;
979 else if (c1 < 0xA0 || c1 > 0xFF)
980 goto invalid_code_posints;
981 return make_number (c1);
983 else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
984 goto invalid_code_posints;
985 c1 &= 0x7F;
986 c2 &= 0x7F;
987 if (c1 == 0
988 ? c2 != 0
989 : (c2 == 0
990 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
991 : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
992 goto invalid_code_posints;
993 return make_number (MAKE_CHAR (charset_id, c1, c2));
995 invalid_code_posints:
996 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
999 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1000 "Return list of charset and one or two position-codes of CHAR.\n\
1001 If CHAR is invalid as a character code,\n\
1002 return a list of symbol `unknown' and CHAR.")
1003 (ch)
1004 Lisp_Object ch;
1006 int c, charset, c1, c2;
1008 CHECK_NUMBER (ch, 0);
1009 c = XFASTINT (ch);
1010 if (!CHAR_VALID_P (c, 1))
1011 return Fcons (Qunknown, Fcons (ch, Qnil));
1012 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1013 return (c2 >= 0
1014 ? Fcons (CHARSET_SYMBOL (charset),
1015 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1016 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1019 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1020 "Return charset of CHAR.")
1021 (ch)
1022 Lisp_Object ch;
1024 CHECK_NUMBER (ch, 0);
1026 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1029 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1030 "Return charset of a character in the current buffer at position POS.\n\
1031 If POS is nil, it defauls to the current point.\n\
1032 If POS is out of range, the value is nil.")
1033 (pos)
1034 Lisp_Object pos;
1036 Lisp_Object ch;
1037 int charset;
1039 ch = Fchar_after (pos);
1040 if (! INTEGERP (ch))
1041 return ch;
1042 charset = CHAR_CHARSET (XINT (ch));
1043 return CHARSET_SYMBOL (charset);
1046 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1047 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1049 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1050 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1051 where as Emacs distinguishes them by charset symbol.\n\
1052 See the documentation of the function `charset-info' for the meanings of\n\
1053 DIMENSION, CHARS, and FINAL-CHAR.")
1054 (dimension, chars, final_char)
1055 Lisp_Object dimension, chars, final_char;
1057 int charset;
1059 CHECK_NUMBER (dimension, 0);
1060 CHECK_NUMBER (chars, 1);
1061 CHECK_NUMBER (final_char, 2);
1063 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1064 return Qnil;
1065 return CHARSET_SYMBOL (charset);
1068 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1069 generic character. If GENERICP is zero, return nonzero iff C is a
1070 valid normal character. Do not call this function directly,
1071 instead use macro CHAR_VALID_P. */
1073 char_valid_p (c, genericp)
1074 int c, genericp;
1076 int charset, c1, c2;
1078 if (c < 0 || c >= MAX_CHAR)
1079 return 0;
1080 if (SINGLE_BYTE_CHAR_P (c))
1081 return 1;
1082 SPLIT_CHAR (c, charset, c1, c2);
1083 if (genericp)
1085 if (c1)
1087 if (c2 <= 0) c2 = 0x20;
1089 else
1091 if (c2 <= 0) c1 = c2 = 0x20;
1094 return (CHARSET_DEFINED_P (charset)
1095 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1098 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1099 "Return t if OBJECT is a valid normal character.\n\
1100 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1101 a valid generic character.")
1102 (object, genericp)
1103 Lisp_Object object, genericp;
1105 if (! NATNUMP (object))
1106 return Qnil;
1107 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1110 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1111 Sunibyte_char_to_multibyte, 1, 1, 0,
1112 "Convert the unibyte character CH to multibyte character.\n\
1113 The conversion is done based on `nonascii-translation-table' (which see)\n\
1114 or `nonascii-insert-offset' (which see).")
1115 (ch)
1116 Lisp_Object ch;
1118 int c;
1120 CHECK_NUMBER (ch, 0);
1121 c = XINT (ch);
1122 if (c < 0 || c >= 0400)
1123 error ("Invalid unibyte character: %d", c);
1124 c = unibyte_char_to_multibyte (c);
1125 if (c < 0)
1126 error ("Can't convert to multibyte character: %d", XINT (ch));
1127 return make_number (c);
1130 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1131 Smultibyte_char_to_unibyte, 1, 1, 0,
1132 "Convert the multibyte character CH to unibyte character.\n\
1133 The conversion is done based on `nonascii-translation-table' (which see)\n\
1134 or `nonascii-insert-offset' (which see).")
1135 (ch)
1136 Lisp_Object ch;
1138 int c;
1140 CHECK_NUMBER (ch, 0);
1141 c = XINT (ch);
1142 if (! CHAR_VALID_P (c, 0))
1143 error ("Invalid multibyte character: %d", c);
1144 c = multibyte_char_to_unibyte (c, Qnil);
1145 if (c < 0)
1146 error ("Can't convert to unibyte character: %d", XINT (ch));
1147 return make_number (c);
1150 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1151 "Return 1 regardless of the argument CHAR.\n\
1152 This is now an obsolete function. We keep it just for backward compatibility.")
1153 (ch)
1154 Lisp_Object ch;
1156 CHECK_NUMBER (ch, 0);
1157 return make_number (1);
1160 /* Return how many bytes C will occupy in a multibyte buffer.
1161 Don't call this function directly, instead use macro CHAR_BYTES. */
1163 char_bytes (c)
1164 int c;
1166 int charset;
1168 if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1)))
1169 return 1;
1170 if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1171 return 1;
1173 charset = CHAR_CHARSET (c);
1174 return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1177 /* Return the width of character of which multi-byte form starts with
1178 C. The width is measured by how many columns occupied on the
1179 screen when displayed in the current buffer. */
1181 #define ONE_BYTE_CHAR_WIDTH(c) \
1182 (c < 0x20 \
1183 ? (c == '\t' \
1184 ? XFASTINT (current_buffer->tab_width) \
1185 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1186 : (c < 0x7f \
1187 ? 1 \
1188 : (c == 0x7F \
1189 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1190 : ((! NILP (current_buffer->enable_multibyte_characters) \
1191 && BASE_LEADING_CODE_P (c)) \
1192 ? WIDTH_BY_CHAR_HEAD (c) \
1193 : 4))))
1195 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1196 "Return width of CHAR when displayed in the current buffer.\n\
1197 The width is measured by how many columns it occupies on the screen.\n\
1198 Tab is taken to occupy `tab-width' columns.")
1199 (ch)
1200 Lisp_Object ch;
1202 Lisp_Object val, disp;
1203 int c;
1204 struct Lisp_Char_Table *dp = buffer_display_table ();
1206 CHECK_NUMBER (ch, 0);
1208 c = XINT (ch);
1210 /* Get the way the display table would display it. */
1211 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1213 if (VECTORP (disp))
1214 XSETINT (val, XVECTOR (disp)->size);
1215 else if (SINGLE_BYTE_CHAR_P (c))
1216 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1217 else
1219 int charset = CHAR_CHARSET (c);
1221 XSETFASTINT (val, CHARSET_WIDTH (charset));
1223 return val;
1226 /* Return width of string STR of length LEN when displayed in the
1227 current buffer. The width is measured by how many columns it
1228 occupies on the screen. */
1231 strwidth (str, len)
1232 unsigned char *str;
1233 int len;
1235 return c_string_width (str, len, -1, NULL, NULL);
1238 /* Return width of string STR of length LEN when displayed in the
1239 current buffer. The width is measured by how many columns it
1240 occupies on the screen. If PRECISION > 0, return the width of
1241 longest substring that doesn't exceed PRECISION, and set number of
1242 characters and bytes of the substring in *NCHARS and *NBYTES
1243 respectively. */
1246 c_string_width (str, len, precision, nchars, nbytes)
1247 unsigned char *str;
1248 int precision, *nchars, *nbytes;
1250 int i = 0, i_byte = 0;
1251 int width = 0;
1252 int chars;
1253 struct Lisp_Char_Table *dp = buffer_display_table ();
1255 while (i_byte < len)
1257 int bytes, thiswidth;
1258 Lisp_Object val;
1260 if (dp)
1262 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
1264 chars = 1;
1265 val = DISP_CHAR_VECTOR (dp, c);
1266 if (VECTORP (val))
1267 thiswidth = XVECTOR (val)->size;
1268 else
1269 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1271 else
1273 chars = 1;
1274 PARSE_MULTIBYTE_SEQ (str + i_byte, len - i_byte, bytes);
1275 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1278 if (precision > 0
1279 && (width + thiswidth > precision))
1281 *nchars = i;
1282 *nbytes = i_byte;
1283 return width;
1285 i++;
1286 i_byte += bytes;
1287 width += thiswidth;
1290 if (precision > 0)
1292 *nchars = i;
1293 *nbytes = i_byte;
1296 return width;
1299 /* Return width of Lisp string STRING when displayed in the current
1300 buffer. The width is measured by how many columns it occupies on
1301 the screen while paying attention to compositions. If PRECISION >
1302 0, return the width of longest substring that doesn't exceed
1303 PRECISION, and set number of characters and bytes of the substring
1304 in *NCHARS and *NBYTES respectively. */
1307 lisp_string_width (string, precision, nchars, nbytes)
1308 Lisp_Object string;
1309 int precision, *nchars, *nbytes;
1311 int len = XSTRING (string)->size;
1312 int len_byte = STRING_BYTES (XSTRING (string));
1313 unsigned char *str = XSTRING (string)->data;
1314 int i = 0, i_byte = 0;
1315 int width = 0;
1316 struct Lisp_Char_Table *dp = buffer_display_table ();
1318 while (i < len)
1320 int chars, bytes, thiswidth;
1321 Lisp_Object val;
1322 int cmp_id;
1323 int ignore, end;
1325 if (find_composition (i, -1, &ignore, &end, &val, string)
1326 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
1327 >= 0))
1329 thiswidth = composition_table[cmp_id]->width;
1330 chars = end - i;
1331 bytes = string_char_to_byte (string, end) - i_byte;
1333 else if (dp)
1335 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
1337 chars = 1;
1338 val = DISP_CHAR_VECTOR (dp, c);
1339 if (VECTORP (val))
1340 thiswidth = XVECTOR (val)->size;
1341 else
1342 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1344 else
1346 chars = 1;
1347 PARSE_MULTIBYTE_SEQ (str + i_byte, len_byte - i_byte, bytes);
1348 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1351 if (precision > 0
1352 && (width + thiswidth > precision))
1354 *nchars = i;
1355 *nbytes = i_byte;
1356 return width;
1358 i += chars;
1359 i_byte += bytes;
1360 width += thiswidth;
1363 if (precision > 0)
1365 *nchars = i;
1366 *nbytes = i_byte;
1369 return width;
1372 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1373 "Return width of STRING when displayed in the current buffer.\n\
1374 Width is measured by how many columns it occupies on the screen.\n\
1375 When calculating width of a multibyte character in STRING,\n\
1376 only the base leading-code is considered; the validity of\n\
1377 the following bytes is not checked. Tabs in STRING are always\n\
1378 taken to occupy `tab-width' columns.")
1379 (str)
1380 Lisp_Object str;
1382 Lisp_Object val;
1384 CHECK_STRING (str, 0);
1385 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
1386 return val;
1389 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1390 "Return the direction of CHAR.\n\
1391 The returned value is 0 for left-to-right and 1 for right-to-left.")
1392 (ch)
1393 Lisp_Object ch;
1395 int charset;
1397 CHECK_NUMBER (ch, 0);
1398 charset = CHAR_CHARSET (XFASTINT (ch));
1399 if (!CHARSET_DEFINED_P (charset))
1400 invalid_character (XINT (ch));
1401 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1404 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1405 "Return number of characters between BEG and END.")
1406 (beg, end)
1407 Lisp_Object beg, end;
1409 int from, to;
1411 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1412 CHECK_NUMBER_COERCE_MARKER (end, 1);
1414 from = min (XFASTINT (beg), XFASTINT (end));
1415 to = max (XFASTINT (beg), XFASTINT (end));
1417 return make_number (to - from);
1420 /* Return the number of characters in the NBYTES bytes at PTR.
1421 This works by looking at the contents and checking for multibyte sequences.
1422 However, if the current buffer has enable-multibyte-characters = nil,
1423 we treat each byte as a character. */
1426 chars_in_text (ptr, nbytes)
1427 unsigned char *ptr;
1428 int nbytes;
1430 /* current_buffer is null at early stages of Emacs initialization. */
1431 if (current_buffer == 0
1432 || NILP (current_buffer->enable_multibyte_characters))
1433 return nbytes;
1435 return multibyte_chars_in_text (ptr, nbytes);
1438 /* Return the number of characters in the NBYTES bytes at PTR.
1439 This works by looking at the contents and checking for multibyte sequences.
1440 It ignores enable-multibyte-characters. */
1443 multibyte_chars_in_text (ptr, nbytes)
1444 unsigned char *ptr;
1445 int nbytes;
1447 unsigned char *endp;
1448 int chars, bytes;
1450 endp = ptr + nbytes;
1451 chars = 0;
1453 while (ptr < endp)
1455 PARSE_MULTIBYTE_SEQ (ptr, endp - ptr, bytes);
1456 ptr += bytes;
1457 chars++;
1460 return chars;
1463 /* Parse unibyte text at STR of LEN bytes as multibyte text, and
1464 count the numbers of characters and bytes in it. On counting
1465 bytes, pay attention to the fact that 8-bit characters in the range
1466 0x80..0x9F are represented by 2 bytes in multibyte text. */
1467 void
1468 parse_str_as_multibyte (str, len, nchars, nbytes)
1469 unsigned char *str;
1470 int len, *nchars, *nbytes;
1472 unsigned char *endp = str + len;
1473 int n, chars = 0, bytes = 0;
1475 while (str < endp)
1477 if (UNIBYTE_STR_AS_MULTIBYTE_P (str, endp - str, n))
1478 str += n, bytes += n;
1479 else
1480 str++, bytes += 2;
1481 chars++;
1483 *nchars = chars;
1484 *nbytes = bytes;
1485 return;
1488 /* Arrange unibyte text at STR of NBYTES bytes as multibyte text.
1489 It actually converts only 8-bit characters in the range 0x80..0x9F
1490 that don't contruct multibyte characters to multibyte forms. If
1491 NCHARS is nonzero, set *NCHARS to the number of characters in the
1492 text. It is assured that we can use LEN bytes at STR as a work
1493 area and that is enough. Return the number of bytes of the
1494 resulting text. */
1497 str_as_multibyte (str, len, nbytes, nchars)
1498 unsigned char *str;
1499 int len, nbytes, *nchars;
1501 unsigned char *p = str, *endp = str + nbytes;
1502 unsigned char *to;
1503 int chars = 0;
1504 int n;
1506 while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1507 p += n, chars++;
1508 if (nchars)
1509 *nchars = chars;
1510 if (p == endp)
1511 return nbytes;
1513 to = p;
1514 nbytes = endp - p;
1515 endp = str + len;
1516 safe_bcopy (p, endp - nbytes, nbytes);
1517 p = endp - nbytes;
1518 while (p < endp)
1520 if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1522 while (n--)
1523 *to++ = *p++;
1525 else
1527 *to++ = LEADING_CODE_8_BIT_CONTROL;
1528 *to++ = *p++ + 0x20;
1530 chars++;
1532 if (nchars)
1533 *nchars = chars;
1534 return (to - str);
1537 /* Parse unibyte string at STR of LEN bytes, and return the number of
1538 bytes it may ocupy when converted to multibyte string by
1539 `str_to_multibyte'. */
1542 parse_str_to_multibyte (str, len)
1543 unsigned char *str;
1544 int len;
1546 unsigned char *endp = str + len;
1547 int bytes;
1549 for (bytes = 0; str < endp; str++)
1550 bytes += (*str < 0x80 || *str >= 0xA0) ? 1 : 2;
1551 return bytes;
1554 /* Convert unibyte text at STR of NBYTES bytes to multibyte text
1555 that contains the same single-byte characters. It actually
1556 converts all 8-bit characters to multibyte forms. It is assured
1557 that we can use LEN bytes at STR as a work area and that is
1558 enough. */
1561 str_to_multibyte (str, len, bytes)
1562 unsigned char *str;
1563 int len, bytes;
1565 unsigned char *p = str, *endp = str + bytes;
1566 unsigned char *to;
1568 while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++;
1569 if (p == endp)
1570 return bytes;
1571 to = p;
1572 bytes = endp - p;
1573 endp = str + len;
1574 safe_bcopy (p, endp - bytes, bytes);
1575 p = endp - bytes;
1576 while (p < endp)
1578 if (*p < 0x80 || *p >= 0xA0)
1579 *to++ = *p++;
1580 else
1581 *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1583 return (to - str);
1586 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1587 actually converts only 8-bit characters in the range 0x80..0x9F to
1588 unibyte forms. */
1591 str_as_unibyte (str, bytes)
1592 unsigned char *str;
1593 int bytes;
1595 unsigned char *p = str, *endp = str + bytes;
1596 unsigned char *to = str;
1598 while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++;
1599 to = p;
1600 while (p < endp)
1602 if (*p == LEADING_CODE_8_BIT_CONTROL)
1603 *to++ = *(p + 1) - 0x20, p += 2;
1604 else
1605 *to++ = *p++;
1607 return (to - str);
1611 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1612 "Concatenate all the argument characters and make the result a string.")
1613 (n, args)
1614 int n;
1615 Lisp_Object *args;
1617 int i;
1618 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1619 unsigned char *p = buf;
1620 int c;
1621 int multibyte = 0;
1623 for (i = 0; i < n; i++)
1625 CHECK_NUMBER (args[i], 0);
1626 if (!multibyte && !SINGLE_BYTE_CHAR_P (XFASTINT (args[i])))
1627 multibyte = 1;
1630 for (i = 0; i < n; i++)
1632 c = XINT (args[i]);
1633 if (multibyte)
1634 p += CHAR_STRING (c, p);
1635 else
1636 *p++ = c;
1639 return make_string_from_bytes (buf, n, p - buf);
1642 #endif /* emacs */
1645 charset_id_internal (charset_name)
1646 char *charset_name;
1648 Lisp_Object val;
1650 val= Fget (intern (charset_name), Qcharset);
1651 if (!VECTORP (val))
1652 error ("Charset %s is not defined", charset_name);
1654 return (XINT (XVECTOR (val)->contents[0]));
1657 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1658 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1661 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1662 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1663 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1664 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1665 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1666 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1667 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1668 return Qnil;
1671 void
1672 init_charset_once ()
1674 int i, j, k;
1676 staticpro (&Vcharset_table);
1677 staticpro (&Vcharset_symbol_table);
1678 staticpro (&Vgeneric_character_list);
1680 /* This has to be done here, before we call Fmake_char_table. */
1681 Qcharset_table = intern ("charset-table");
1682 staticpro (&Qcharset_table);
1684 /* Intern this now in case it isn't already done.
1685 Setting this variable twice is harmless.
1686 But don't staticpro it here--that is done in alloc.c. */
1687 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1689 /* Now we are ready to set up this property, so we can
1690 create the charset table. */
1691 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1692 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1694 Qunknown = intern ("unknown");
1695 staticpro (&Qunknown);
1696 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1697 Qunknown);
1699 /* Setup tables. */
1700 for (i = 0; i < 2; i++)
1701 for (j = 0; j < 2; j++)
1702 for (k = 0; k < 128; k++)
1703 iso_charset_table [i][j][k] = -1;
1705 for (i = 0; i < 256; i++)
1706 bytes_by_char_head[i] = 1;
1707 bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1708 bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1709 bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1710 bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1712 for (i = 0; i < 128; i++)
1713 width_by_char_head[i] = 1;
1714 for (; i < 256; i++)
1715 width_by_char_head[i] = 4;
1716 width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1717 width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1718 width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1719 width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1722 Lisp_Object val;
1724 val = Qnil;
1725 for (i = 0x81; i < 0x90; i++)
1726 val = Fcons (make_number ((i - 0x70) << 7), val);
1727 for (; i < 0x9A; i++)
1728 val = Fcons (make_number ((i - 0x8F) << 14), val);
1729 for (i = 0xA0; i < 0xF0; i++)
1730 val = Fcons (make_number ((i - 0x70) << 7), val);
1731 for (; i < 0xFF; i++)
1732 val = Fcons (make_number ((i - 0xE0) << 14), val);
1733 Vgeneric_character_list = Fnreverse (val);
1736 nonascii_insert_offset = 0;
1737 Vnonascii_translation_table = Qnil;
1740 #ifdef emacs
1742 void
1743 syms_of_charset ()
1745 Qcharset = intern ("charset");
1746 staticpro (&Qcharset);
1748 Qascii = intern ("ascii");
1749 staticpro (&Qascii);
1751 Qeight_bit_control = intern ("eight-bit-control");
1752 staticpro (&Qeight_bit_control);
1754 Qeight_bit_graphic = intern ("eight-bit-graphic");
1755 staticpro (&Qeight_bit_graphic);
1757 /* Define special charsets ascii, eight-bit-control, and
1758 eight-bit-graphic. */
1759 update_charset_table (make_number (CHARSET_ASCII),
1760 make_number (1), make_number (94),
1761 make_number (1),
1762 make_number (0),
1763 make_number ('B'),
1764 make_number (0),
1765 build_string ("ASCII"),
1766 Qnil, /* same as above */
1767 build_string ("ASCII (ISO646 IRV)"));
1768 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1769 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1771 update_charset_table (make_number (CHARSET_8_BIT_CONTROL),
1772 make_number (1), make_number (96),
1773 make_number (4),
1774 make_number (0),
1775 make_number (-1),
1776 make_number (-1),
1777 build_string ("8-bit control code (0x80..0x9F)"),
1778 Qnil, /* same as above */
1779 Qnil); /* same as above */
1780 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control;
1781 Fput (Qeight_bit_control, Qcharset,
1782 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL));
1784 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC),
1785 make_number (1), make_number (96),
1786 make_number (4),
1787 make_number (0),
1788 make_number (-1),
1789 make_number (-1),
1790 build_string ("8-bit graphic char (0xA0..0xFF)"),
1791 Qnil, /* same as above */
1792 Qnil); /* same as above */
1793 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1794 Fput (Qeight_bit_graphic, Qcharset,
1795 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1797 Qauto_fill_chars = intern ("auto-fill-chars");
1798 staticpro (&Qauto_fill_chars);
1799 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1801 defsubr (&Sdefine_charset);
1802 defsubr (&Sgeneric_character_list);
1803 defsubr (&Sget_unused_iso_final_char);
1804 defsubr (&Sdeclare_equiv_charset);
1805 defsubr (&Sfind_charset_region);
1806 defsubr (&Sfind_charset_string);
1807 defsubr (&Smake_char_internal);
1808 defsubr (&Ssplit_char);
1809 defsubr (&Schar_charset);
1810 defsubr (&Scharset_after);
1811 defsubr (&Siso_charset);
1812 defsubr (&Schar_valid_p);
1813 defsubr (&Sunibyte_char_to_multibyte);
1814 defsubr (&Smultibyte_char_to_unibyte);
1815 defsubr (&Schar_bytes);
1816 defsubr (&Schar_width);
1817 defsubr (&Sstring_width);
1818 defsubr (&Schar_direction);
1819 defsubr (&Schars_in_region);
1820 defsubr (&Sstring);
1821 defsubr (&Ssetup_special_charsets);
1823 DEFVAR_LISP ("charset-list", &Vcharset_list,
1824 "List of charsets ever defined.");
1825 Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control,
1826 Fcons (Qeight_bit_graphic, Qnil)));
1828 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1829 "Vector of cons cell of a symbol and translation table ever defined.\n\
1830 An ID of a translation table is an index of this vector.");
1831 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1833 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1834 "Leading-code of private TYPE9N charset of column-width 1.");
1835 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1837 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1838 "Leading-code of private TYPE9N charset of column-width 2.");
1839 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1841 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1842 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1843 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1845 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1846 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1847 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1849 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1850 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1851 This is used for converting unibyte text to multibyte,\n\
1852 and for inserting character codes specified by number.\n\n\
1853 This serves to convert a Latin-1 or similar 8-bit character code\n\
1854 to the corresponding Emacs multibyte character code.\n\
1855 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1856 for your choice of character set.\n\
1857 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1858 nonascii_insert_offset = 0;
1860 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1861 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1862 This is used for converting unibyte text to multibyte,\n\
1863 and for inserting character codes specified by number.\n\n\
1864 Conversion is performed only when multibyte characters are enabled,\n\
1865 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1866 to the corresponding Emacs character code.\n\n\
1867 If this is nil, `nonascii-insert-offset' is used instead.\n\
1868 See also the docstring of `make-translation-table'.");
1869 Vnonascii_translation_table = Qnil;
1871 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1872 "A char-table for characters which invoke auto-filling.\n\
1873 Such characters have value t in this table.");
1874 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1875 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1876 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1879 #endif /* emacs */