Docstring fixes.
[emacs.git] / src / charset.c
blob220d6749c67ae8d5789142e147ac91159c342509
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* At first, see the document in `charset.h' to understand the code in
23 this file. */
25 #ifdef emacs
26 #include <config.h>
27 #endif
29 #include <stdio.h>
31 #ifdef emacs
33 #include <sys/types.h>
34 #include "lisp.h"
35 #include "buffer.h"
36 #include "charset.h"
37 #include "composite.h"
38 #include "coding.h"
39 #include "disptab.h"
41 #else /* not emacs */
43 #include "mulelib.h"
45 #endif /* emacs */
47 Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
48 Lisp_Object Qunknown;
50 /* Declaration of special leading-codes. */
51 int leading_code_private_11; /* for private DIMENSION1 of 1-column */
52 int leading_code_private_12; /* for private DIMENSION1 of 2-column */
53 int leading_code_private_21; /* for private DIMENSION2 of 1-column */
54 int leading_code_private_22; /* for private DIMENSION2 of 2-column */
56 /* Declaration of special charsets. The values are set by
57 Fsetup_special_charsets. */
58 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
59 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
60 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
61 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
62 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
63 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
64 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
66 Lisp_Object Qcharset_table;
68 /* A char-table containing information of each character set. */
69 Lisp_Object Vcharset_table;
71 /* A vector of charset symbol indexed by charset-id. This is used
72 only for returning charset symbol from C functions. */
73 Lisp_Object Vcharset_symbol_table;
75 /* A list of charset symbols ever defined. */
76 Lisp_Object Vcharset_list;
78 /* Vector of translation table ever defined.
79 ID of a translation table is used to index this vector. */
80 Lisp_Object Vtranslation_table_vector;
82 /* A char-table for characters which may invoke auto-filling. */
83 Lisp_Object Vauto_fill_chars;
85 Lisp_Object Qauto_fill_chars;
87 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
88 int bytes_by_char_head[256];
89 int width_by_char_head[256];
91 /* Mapping table from ISO2022's charset (specified by DIMENSION,
92 CHARS, and FINAL-CHAR) to Emacs' charset. */
93 int iso_charset_table[2][2][128];
95 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
96 unsigned char *_fetch_multibyte_char_p;
97 int _fetch_multibyte_char_len;
99 /* Offset to add to a non-ASCII value when inserting it. */
100 int nonascii_insert_offset;
102 /* Translation table for converting non-ASCII unibyte characters
103 to multibyte codes, or nil. */
104 Lisp_Object Vnonascii_translation_table;
106 /* List of all possible generic characters. */
107 Lisp_Object Vgeneric_character_list;
109 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
110 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
112 void
113 invalid_character (c)
114 int c;
116 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
119 /* Parse string STR of length LENGTH and fetch information of a
120 character at STR. Set BYTES to the byte length the character
121 occupies, CHARSET, C1, C2 to proper values of the character. */
123 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
124 do { \
125 (c1) = *(str); \
126 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
127 if ((bytes) == 1) \
128 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
129 else if ((bytes) == 2) \
131 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
132 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
133 else \
134 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
136 else if ((bytes) == 3) \
138 if ((c1) < LEADING_CODE_PRIVATE_11) \
139 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
140 else \
141 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
143 else \
144 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
145 } while (0)
147 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
148 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
149 ((charset) == CHARSET_ASCII \
150 ? ((c1) >= 0 && (c1) <= 0x7F) \
151 : ((charset) == CHARSET_8_BIT_CONTROL \
152 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
153 : ((charset) == CHARSET_8_BIT_GRAPHIC \
154 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
155 : (CHARSET_DIMENSION (charset) == 1 \
156 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
157 : ((c1) >= 0x20 && (c1) <= 0x7F \
158 && (c2) >= 0x20 && (c2) <= 0x7F)))))
160 /* Store multi-byte form of the character C in STR. The caller should
161 allocate at least 4-byte area at STR in advance. Returns the
162 length of the multi-byte form. If C is an invalid character code,
163 signal an error.
165 Use macro `CHAR_STRING (C, STR)' instead of calling this function
166 directly if C can be an ASCII character. */
169 char_to_string (c, str)
170 int c;
171 unsigned char *str;
173 unsigned char *p = str;
175 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
177 /* Multibyte character can't have a modifier bit. */
178 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
179 invalid_character (c);
181 /* For Meta, Shift, and Control modifiers, we need special care. */
182 if (c & CHAR_META)
184 /* Move the meta bit to the right place for a string. */
185 c = (c & ~CHAR_META) | 0x80;
187 if (c & CHAR_SHIFT)
189 /* Shift modifier is valid only with [A-Za-z]. */
190 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
191 c &= ~CHAR_SHIFT;
192 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
193 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
195 if (c & CHAR_CTL)
197 /* Simulate the code in lread.c. */
198 /* Allow `\C- ' and `\C-?'. */
199 if (c == (CHAR_CTL | ' '))
200 c = 0;
201 else if (c == (CHAR_CTL | '?'))
202 c = 127;
203 /* ASCII control chars are made from letters (both cases),
204 as well as the non-letters within 0100...0137. */
205 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
206 c &= (037 | (~0177 & ~CHAR_CTL));
207 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
208 c &= (037 | (~0177 & ~CHAR_CTL));
211 /* If C still has any modifier bits, it is an invalid character. */
212 if (c & CHAR_MODIFIER_MASK)
213 invalid_character (c);
215 if (SINGLE_BYTE_CHAR_P (c))
217 if (ASCII_BYTE_P (c) || c >= 0xA0)
218 *p++ = c;
219 else
221 *p++ = LEADING_CODE_8_BIT_CONTROL;
222 *p++ = c + 0x20;
225 else if (CHAR_VALID_P (c, 0))
227 int charset, c1, c2;
229 SPLIT_CHAR (c, charset, c1, c2);
231 if (charset >= LEADING_CODE_EXT_11)
232 *p++ = (charset < LEADING_CODE_EXT_12
233 ? LEADING_CODE_PRIVATE_11
234 : (charset < LEADING_CODE_EXT_21
235 ? LEADING_CODE_PRIVATE_12
236 : (charset < LEADING_CODE_EXT_22
237 ? LEADING_CODE_PRIVATE_21
238 : LEADING_CODE_PRIVATE_22)));
239 *p++ = charset;
240 if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
241 invalid_character (c);
242 if (c1)
244 *p++ = c1 | 0x80;
245 if (c2 > 0)
246 *p++ = c2 | 0x80;
249 else
250 invalid_character (c);
252 return (p - str);
255 /* Return the non-ASCII character corresponding to multi-byte form at
256 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
257 length of the multibyte form in *ACTUAL_LEN.
259 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
260 this function directly if you want ot handle ASCII characters as
261 well. */
264 string_to_char (str, len, actual_len)
265 const unsigned char *str;
266 int len, *actual_len;
268 int c, bytes, charset, c1, c2;
270 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
271 c = MAKE_CHAR (charset, c1, c2);
272 if (actual_len)
273 *actual_len = bytes;
274 return c;
277 /* Return the length of the multi-byte form at string STR of length LEN.
278 Use the macro MULTIBYTE_FORM_LENGTH instead. */
280 multibyte_form_length (str, len)
281 const unsigned char *str;
282 int len;
284 int bytes;
286 PARSE_MULTIBYTE_SEQ (str, len, bytes);
287 return bytes;
290 /* Check multibyte form at string STR of length LEN and set variables
291 pointed by CHARSET, C1, and C2 to charset and position codes of the
292 character at STR, and return 0. If there's no multibyte character,
293 return -1. This should be used only in the macro SPLIT_STRING
294 which checks range of STR in advance. */
297 split_string (str, len, charset, c1, c2)
298 const unsigned char *str;
299 unsigned char *c1, *c2;
300 int len, *charset;
302 register int bytes, cs, code1, code2 = -1;
304 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
305 if (cs == CHARSET_ASCII)
306 return -1;
307 *charset = cs;
308 *c1 = code1;
309 *c2 = code2;
310 return 0;
313 /* Return 1 iff character C has valid printable glyph.
314 Use the macro CHAR_PRINTABLE_P instead. */
316 char_printable_p (c)
317 int c;
319 int charset, c1, c2, chars;
321 if (ASCII_BYTE_P (c))
322 return 1;
323 else if (SINGLE_BYTE_CHAR_P (c))
324 return 0;
325 else if (c >= MAX_CHAR)
326 return 0;
328 SPLIT_CHAR (c, charset, c1, c2);
329 if (! CHARSET_DEFINED_P (charset))
330 return 0;
331 if (CHARSET_CHARS (charset) == 94
332 ? c1 <= 32 || c1 >= 127
333 : c1 < 32)
334 return 0;
335 if (CHARSET_DIMENSION (charset) == 2
336 && (CHARSET_CHARS (charset) == 94
337 ? c2 <= 32 || c2 >= 127
338 : c2 < 32))
339 return 0;
340 return 1;
343 /* Translate character C by translation table TABLE. If C
344 is negative, translate a character specified by CHARSET, C1, and C2
345 (C1 and C2 are code points of the character). If no translation is
346 found in TABLE, return C. */
348 translate_char (table, c, charset, c1, c2)
349 Lisp_Object table;
350 int c, charset, c1, c2;
352 Lisp_Object ch;
353 int alt_charset, alt_c1, alt_c2, dimension;
355 if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
356 if (!CHAR_TABLE_P (table)
357 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
358 return c;
360 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
361 dimension = CHARSET_DIMENSION (alt_charset);
362 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
363 /* CH is not a generic character, just return it. */
364 return XFASTINT (ch);
366 /* Since CH is a generic character, we must return a specific
367 charater which has the same position codes as C from CH. */
368 if (charset < 0)
369 SPLIT_CHAR (c, charset, c1, c2);
370 if (dimension != CHARSET_DIMENSION (charset))
371 /* We can't make such a character because of dimension mismatch. */
372 return c;
373 return MAKE_CHAR (alt_charset, c1, c2);
376 /* Convert the unibyte character C to multibyte based on
377 Vnonascii_translation_table or nonascii_insert_offset. If they can't
378 convert C to a valid multibyte character, convert it based on
379 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
382 unibyte_char_to_multibyte (c)
383 int c;
385 if (c < 0400 && c >= 0200)
387 int c_save = c;
389 if (! NILP (Vnonascii_translation_table))
391 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
392 if (c >= 0400 && ! char_valid_p (c, 0))
393 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
395 else if (c >= 0240 && nonascii_insert_offset > 0)
397 c += nonascii_insert_offset;
398 if (c < 0400 || ! char_valid_p (c, 0))
399 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
401 else if (c >= 0240)
402 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
404 return c;
408 /* Convert the multibyte character C to unibyte 8-bit character based
409 on Vnonascii_translation_table or nonascii_insert_offset. If
410 REV_TBL is non-nil, it should be a reverse table of
411 Vnonascii_translation_table, i.e. what given by:
412 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
415 multibyte_char_to_unibyte (c, rev_tbl)
416 int c;
417 Lisp_Object rev_tbl;
419 if (!SINGLE_BYTE_CHAR_P (c))
421 int c_save = c;
423 if (! CHAR_TABLE_P (rev_tbl)
424 && CHAR_TABLE_P (Vnonascii_translation_table))
425 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
426 make_number (0));
427 if (CHAR_TABLE_P (rev_tbl))
429 Lisp_Object temp;
430 temp = Faref (rev_tbl, make_number (c));
431 if (INTEGERP (temp))
432 c = XINT (temp);
433 if (c >= 256)
434 c = (c_save & 0177) + 0200;
436 else
438 if (nonascii_insert_offset > 0)
439 c -= nonascii_insert_offset;
440 if (c < 128 || c >= 256)
441 c = (c_save & 0177) + 0200;
445 return c;
449 /* Update the table Vcharset_table with the given arguments (see the
450 document of `define-charset' for the meaning of each argument).
451 Several other table contents are also updated. The caller should
452 check the validity of CHARSET-ID and the remaining arguments in
453 advance. */
455 void
456 update_charset_table (charset_id, dimension, chars, width, direction,
457 iso_final_char, iso_graphic_plane,
458 short_name, long_name, description)
459 Lisp_Object charset_id, dimension, chars, width, direction;
460 Lisp_Object iso_final_char, iso_graphic_plane;
461 Lisp_Object short_name, long_name, description;
463 int charset = XINT (charset_id);
464 int bytes;
465 unsigned char leading_code_base, leading_code_ext;
467 if (NILP (CHARSET_TABLE_ENTRY (charset)))
468 CHARSET_TABLE_ENTRY (charset)
469 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
471 if (NILP (long_name))
472 long_name = short_name;
473 if (NILP (description))
474 description = long_name;
476 /* Get byte length of multibyte form, base leading-code, and
477 extended leading-code of the charset. See the comment under the
478 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
479 bytes = XINT (dimension);
480 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
482 /* Official charset, it doesn't have an extended leading-code. */
483 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
484 bytes += 1; /* For a base leading-code. */
485 leading_code_base = charset;
486 leading_code_ext = 0;
488 else
490 /* Private charset. */
491 bytes += 2; /* For base and extended leading-codes. */
492 leading_code_base
493 = (charset < LEADING_CODE_EXT_12
494 ? LEADING_CODE_PRIVATE_11
495 : (charset < LEADING_CODE_EXT_21
496 ? LEADING_CODE_PRIVATE_12
497 : (charset < LEADING_CODE_EXT_22
498 ? LEADING_CODE_PRIVATE_21
499 : LEADING_CODE_PRIVATE_22)));
500 leading_code_ext = charset;
501 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
502 error ("Invalid dimension for the charset-ID %d", charset);
505 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
506 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
507 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
508 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
509 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
510 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
511 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
512 = make_number (leading_code_base);
513 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
514 = make_number (leading_code_ext);
515 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
516 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
517 = iso_graphic_plane;
518 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
519 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
520 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
521 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
524 /* If we have already defined a charset which has the same
525 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
526 DIRECTION, we must update the entry REVERSE-CHARSET of both
527 charsets. If there's no such charset, the value of the entry
528 is set to nil. */
529 int i;
531 for (i = 0; i <= MAX_CHARSET; i++)
532 if (!NILP (CHARSET_TABLE_ENTRY (i)))
534 if (CHARSET_DIMENSION (i) == XINT (dimension)
535 && CHARSET_CHARS (i) == XINT (chars)
536 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
537 && CHARSET_DIRECTION (i) != XINT (direction))
539 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
540 = make_number (i);
541 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
542 break;
545 if (i > MAX_CHARSET)
546 /* No such a charset. */
547 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
548 = make_number (-1);
551 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
552 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
554 bytes_by_char_head[leading_code_base] = bytes;
555 width_by_char_head[leading_code_base] = XINT (width);
557 /* Update table emacs_code_class. */
558 emacs_code_class[charset] = (bytes == 2
559 ? EMACS_leading_code_2
560 : (bytes == 3
561 ? EMACS_leading_code_3
562 : EMACS_leading_code_4));
565 /* Update table iso_charset_table. */
566 if (XINT (iso_final_char) >= 0
567 && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
568 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
571 #ifdef emacs
573 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
574 is invalid. */
576 get_charset_id (charset_symbol)
577 Lisp_Object charset_symbol;
579 Lisp_Object val;
580 int charset;
582 return ((SYMBOLP (charset_symbol)
583 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
584 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
585 CHARSET_VALID_P (charset)))
586 ? charset : -1);
589 /* Return an identification number for a new private charset of
590 DIMENSION and WIDTH. If there's no more room for the new charset,
591 return 0. */
592 Lisp_Object
593 get_new_private_charset_id (dimension, width)
594 int dimension, width;
596 int charset, from, to;
598 if (dimension == 1)
600 if (width == 1)
601 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
602 else
603 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
605 else
607 if (width == 1)
608 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
609 else
610 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
613 for (charset = from; charset < to; charset++)
614 if (!CHARSET_DEFINED_P (charset)) break;
616 return make_number (charset < to ? charset : 0);
619 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
620 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
621 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
622 treated as a private charset.\n\
623 INFO-VECTOR is a vector of the format:\n\
624 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
625 SHORT-NAME LONG-NAME DESCRIPTION]\n\
626 The meanings of each elements is as follows:\n\
627 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
628 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
629 WIDTH (integer) is the number of columns a character in the charset\n\
630 occupies on the screen: one of 0, 1, and 2.\n\
632 DIRECTION (integer) is the rendering direction of characters in the\n\
633 charset when rendering. If 0, render from left to right, else\n\
634 render from right to left.\n\
636 ISO-FINAL-CHAR (character) is the final character of the\n\
637 corresponding ISO 2022 charset.\n\
638 It may be -1 if the charset is internal use only.\n\
640 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
641 while encoding to variants of ISO 2022 coding system, one of the\n\
642 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
643 It may be -1 if the charset is internal use only.\n\
645 SHORT-NAME (string) is the short name to refer to the charset.\n\
647 LONG-NAME (string) is the long name to refer to the charset.\n\
649 DESCRIPTION (string) is the description string of the charset.")
650 (charset_id, charset_symbol, info_vector)
651 Lisp_Object charset_id, charset_symbol, info_vector;
653 Lisp_Object *vec;
655 if (!NILP (charset_id))
656 CHECK_NUMBER (charset_id, 0);
657 CHECK_SYMBOL (charset_symbol, 1);
658 CHECK_VECTOR (info_vector, 2);
660 if (! NILP (charset_id))
662 if (! CHARSET_VALID_P (XINT (charset_id)))
663 error ("Invalid CHARSET: %d", XINT (charset_id));
664 else if (CHARSET_DEFINED_P (XINT (charset_id)))
665 error ("Already defined charset: %d", XINT (charset_id));
668 vec = XVECTOR (info_vector)->contents;
669 if (XVECTOR (info_vector)->size != 9
670 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
671 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
672 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
673 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
674 || !INTEGERP (vec[4])
675 || !(XINT (vec[4]) == -1 || XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
676 || !INTEGERP (vec[5])
677 || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
678 || !STRINGP (vec[6])
679 || !STRINGP (vec[7])
680 || !STRINGP (vec[8]))
681 error ("Invalid info-vector argument for defining charset %s",
682 XSYMBOL (charset_symbol)->name->data);
684 if (NILP (charset_id))
686 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
687 if (XINT (charset_id) == 0)
688 error ("There's no room for a new private charset %s",
689 XSYMBOL (charset_symbol)->name->data);
692 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
693 vec[4], vec[5], vec[6], vec[7], vec[8]);
694 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
695 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
696 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
697 return Qnil;
700 DEFUN ("generic-character-list", Fgeneric_character_list,
701 Sgeneric_character_list, 0, 0, 0,
702 "Return a list of all possible generic characters.\n\
703 It includes a generic character for a charset not yet defined.")
706 return Vgeneric_character_list;
709 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
710 Sget_unused_iso_final_char, 2, 2, 0,
711 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
712 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
713 CHARS is the number of characters in a dimension: 94 or 96.\n\
715 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
716 If there's no unused final char for the specified kind of charset,\n\
717 return nil.")
718 (dimension, chars)
719 Lisp_Object dimension, chars;
721 int final_char;
723 CHECK_NUMBER (dimension, 0);
724 CHECK_NUMBER (chars, 1);
725 if (XINT (dimension) != 1 && XINT (dimension) != 2)
726 error ("Invalid charset dimension %d, it should be 1 or 2",
727 XINT (dimension));
728 if (XINT (chars) != 94 && XINT (chars) != 96)
729 error ("Invalid charset chars %d, it should be 94 or 96",
730 XINT (chars));
731 for (final_char = '0'; final_char <= '?'; final_char++)
733 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
734 break;
736 return (final_char <= '?' ? make_number (final_char) : Qnil);
739 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
740 4, 4, 0,
741 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
742 CHARSET should be defined by `defined-charset' in advance.")
743 (dimension, chars, final_char, charset_symbol)
744 Lisp_Object dimension, chars, final_char, charset_symbol;
746 int charset;
748 CHECK_NUMBER (dimension, 0);
749 CHECK_NUMBER (chars, 1);
750 CHECK_NUMBER (final_char, 2);
751 CHECK_SYMBOL (charset_symbol, 3);
753 if (XINT (dimension) != 1 && XINT (dimension) != 2)
754 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
755 if (XINT (chars) != 94 && XINT (chars) != 96)
756 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
757 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
758 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
759 if ((charset = get_charset_id (charset_symbol)) < 0)
760 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
762 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
763 return Qnil;
766 /* Return information about charsets in the text at PTR of NBYTES
767 bytes, which are NCHARS characters. The value is:
769 0: Each character is represented by one byte. This is always
770 true for unibyte text.
771 1: No charsets other than ascii eight-bit-control,
772 eight-bit-graphic, and latin-1 are found.
773 2: Otherwise.
775 In addition, if CHARSETS is nonzero, for each found charset N, set
776 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
777 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
778 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
779 1 (note that there's no charset whose ID is 1). */
782 find_charset_in_text (ptr, nchars, nbytes, charsets, table)
783 unsigned char *ptr;
784 int nchars, nbytes, *charsets;
785 Lisp_Object table;
787 if (nchars == nbytes)
789 if (charsets && nbytes > 0)
791 unsigned char *endp = ptr + nbytes;
792 int maskbits = 0;
794 while (ptr < endp && maskbits != 7)
796 maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
797 ptr++;
800 if (maskbits & 1)
801 charsets[CHARSET_ASCII] = 1;
802 if (maskbits & 2)
803 charsets[CHARSET_8_BIT_CONTROL] = 1;
804 if (maskbits & 4)
805 charsets[CHARSET_8_BIT_GRAPHIC] = 1;
807 return 0;
809 else
811 int return_val = 1;
812 int bytes, charset, c1, c2;
814 if (! CHAR_TABLE_P (table))
815 table = Qnil;
817 while (nchars-- > 0)
819 SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
820 ptr += bytes;
822 if (!CHARSET_DEFINED_P (charset))
823 charset = 1;
824 else if (! NILP (table))
826 int c = translate_char (table, -1, charset, c1, c2);
827 if (c >= 0)
828 charset = CHAR_CHARSET (c);
831 if (return_val == 1
832 && charset != CHARSET_ASCII
833 && charset != CHARSET_8_BIT_CONTROL
834 && charset != CHARSET_8_BIT_GRAPHIC
835 && charset != charset_latin_iso8859_1)
836 return_val = 2;
838 if (charsets)
839 charsets[charset] = 1;
840 else if (return_val == 2)
841 break;
843 return return_val;
847 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
848 2, 3, 0,
849 "Return a list of charsets in the region between BEG and END.\n\
850 BEG and END are buffer positions.\n\
851 Optional arg TABLE if non-nil is a translation table to look up.\n\
853 If the region contains invalid multiybte characters,\n\
854 `unknown' is included in the returned list.\n\
856 If the current buffer is unibyte, the returned list may contain\n\
857 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
858 (beg, end, table)
859 Lisp_Object beg, end, table;
861 int charsets[MAX_CHARSET + 1];
862 int from, from_byte, to, stop, stop_byte, i;
863 Lisp_Object val;
865 validate_region (&beg, &end);
866 from = XFASTINT (beg);
867 stop = to = XFASTINT (end);
869 if (from < GPT && GPT < to)
871 stop = GPT;
872 stop_byte = GPT_BYTE;
874 else
875 stop_byte = CHAR_TO_BYTE (stop);
877 from_byte = CHAR_TO_BYTE (from);
879 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
880 while (1)
882 find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from,
883 stop_byte - from_byte, charsets, table);
884 if (stop < to)
886 from = stop, from_byte = stop_byte;
887 stop = to, stop_byte = CHAR_TO_BYTE (stop);
889 else
890 break;
893 val = Qnil;
894 if (charsets[1])
895 val = Fcons (Qunknown, val);
896 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
897 if (charsets[i])
898 val = Fcons (CHARSET_SYMBOL (i), val);
899 if (charsets[0])
900 val = Fcons (Qascii, val);
901 return val;
904 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
905 1, 2, 0,
906 "Return a list of charsets in STR.\n\
907 Optional arg TABLE if non-nil is a translation table to look up.\n\
909 If the string contains invalid multiybte characters,\n\
910 `unknown' is included in the returned list.\n\
912 If STR is unibyte, the returned list may contain\n\
913 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
914 (str, table)
915 Lisp_Object str, table;
917 int charsets[MAX_CHARSET + 1];
918 int i;
919 Lisp_Object val;
921 CHECK_STRING (str, 0);
923 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
924 find_charset_in_text (XSTRING (str)->data, XSTRING (str)->size,
925 STRING_BYTES (XSTRING (str)), charsets, table);
927 val = Qnil;
928 if (charsets[1])
929 val = Fcons (Qunknown, val);
930 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
931 if (charsets[i])
932 val = Fcons (CHARSET_SYMBOL (i), val);
933 if (charsets[0])
934 val = Fcons (Qascii, val);
935 return val;
939 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
941 (charset, code1, code2)
942 Lisp_Object charset, code1, code2;
944 int charset_id, c1, c2;
946 CHECK_NUMBER (charset, 0);
947 charset_id = XINT (charset);
948 if (!CHARSET_DEFINED_P (charset_id))
949 error ("Invalid charset ID: %d", XINT (charset));
951 if (NILP (code1))
952 c1 = 0;
953 else
955 CHECK_NUMBER (code1, 1);
956 c1 = XINT (code1);
958 if (NILP (code2))
959 c2 = 0;
960 else
962 CHECK_NUMBER (code2, 2);
963 c2 = XINT (code2);
966 if (charset_id == CHARSET_ASCII)
968 if (c1 < 0 || c1 > 0x7F)
969 goto invalid_code_posints;
970 return make_number (c1);
972 else if (charset_id == CHARSET_8_BIT_CONTROL)
974 if (c1 < 0x80 || c1 > 0x9F)
975 goto invalid_code_posints;
976 return make_number (c1);
978 else if (charset_id == CHARSET_8_BIT_GRAPHIC)
980 if (c1 < 0xA0 || c1 > 0xFF)
981 goto invalid_code_posints;
982 return make_number (c1);
984 else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
985 goto invalid_code_posints;
986 c1 &= 0x7F;
987 c2 &= 0x7F;
988 if (c1 == 0
989 ? c2 != 0
990 : (c2 == 0
991 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
992 : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
993 goto invalid_code_posints;
994 return make_number (MAKE_CHAR (charset_id, c1, c2));
996 invalid_code_posints:
997 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
1000 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1001 "Return list of charset and one or two position-codes of CHAR.\n\
1002 If CHAR is invalid as a character code,\n\
1003 return a list of symbol `unknown' and CHAR.")
1004 (ch)
1005 Lisp_Object ch;
1007 Lisp_Object val;
1008 int c, charset, c1, c2;
1010 CHECK_NUMBER (ch, 0);
1011 c = XFASTINT (ch);
1012 if (!CHAR_VALID_P (c, 1))
1013 return Fcons (Qunknown, Fcons (ch, Qnil));
1014 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1015 return (c2 >= 0
1016 ? Fcons (CHARSET_SYMBOL (charset),
1017 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1018 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1021 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1022 "Return charset of CHAR.")
1023 (ch)
1024 Lisp_Object ch;
1026 CHECK_NUMBER (ch, 0);
1028 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1031 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1032 "Return charset of a character in the current buffer at position POS.\n\
1033 If POS is nil, it defauls to the current point.\n\
1034 If POS is out of range, the value is nil.")
1035 (pos)
1036 Lisp_Object pos;
1038 Lisp_Object ch;
1039 int charset;
1041 ch = Fchar_after (pos);
1042 if (! INTEGERP (ch))
1043 return ch;
1044 charset = CHAR_CHARSET (XINT (ch));
1045 return CHARSET_SYMBOL (charset);
1048 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1049 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1051 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1052 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1053 where as Emacs distinguishes them by charset symbol.\n\
1054 See the documentation of the function `charset-info' for the meanings of\n\
1055 DIMENSION, CHARS, and FINAL-CHAR.")
1056 (dimension, chars, final_char)
1057 Lisp_Object dimension, chars, final_char;
1059 int charset;
1061 CHECK_NUMBER (dimension, 0);
1062 CHECK_NUMBER (chars, 1);
1063 CHECK_NUMBER (final_char, 2);
1065 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1066 return Qnil;
1067 return CHARSET_SYMBOL (charset);
1070 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1071 generic character. If GENERICP is zero, return nonzero iff C is a
1072 valid normal character. Do not call this function directly,
1073 instead use macro CHAR_VALID_P. */
1075 char_valid_p (c, genericp)
1076 int c, genericp;
1078 int charset, c1, c2;
1080 if (c < 0 || c >= MAX_CHAR)
1081 return 0;
1082 if (SINGLE_BYTE_CHAR_P (c))
1083 return 1;
1084 SPLIT_CHAR (c, charset, c1, c2);
1085 if (genericp)
1087 if (c1)
1089 if (c2 <= 0) c2 = 0x20;
1091 else
1093 if (c2 <= 0) c1 = c2 = 0x20;
1096 return (CHARSET_DEFINED_P (charset)
1097 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1100 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1101 "Return t if OBJECT is a valid normal character.\n\
1102 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1103 a valid generic character.")
1104 (object, genericp)
1105 Lisp_Object object, genericp;
1107 if (! NATNUMP (object))
1108 return Qnil;
1109 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1112 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1113 Sunibyte_char_to_multibyte, 1, 1, 0,
1114 "Convert the unibyte character CH to multibyte character.\n\
1115 The conversion is done based on `nonascii-translation-table' (which see)\n\
1116 or `nonascii-insert-offset' (which see).")
1117 (ch)
1118 Lisp_Object ch;
1120 int c;
1122 CHECK_NUMBER (ch, 0);
1123 c = XINT (ch);
1124 if (c < 0 || c >= 0400)
1125 error ("Invalid unibyte character: %d", c);
1126 c = unibyte_char_to_multibyte (c);
1127 if (c < 0)
1128 error ("Can't convert to multibyte character: %d", XINT (ch));
1129 return make_number (c);
1132 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1133 Smultibyte_char_to_unibyte, 1, 1, 0,
1134 "Convert the multibyte character CH to unibyte character.\n\
1135 The conversion is done based on `nonascii-translation-table' (which see)\n\
1136 or `nonascii-insert-offset' (which see).")
1137 (ch)
1138 Lisp_Object ch;
1140 int c;
1142 CHECK_NUMBER (ch, 0);
1143 c = XINT (ch);
1144 if (! CHAR_VALID_P (c, 0))
1145 error ("Invalid multibyte character: %d", c);
1146 c = multibyte_char_to_unibyte (c, Qnil);
1147 if (c < 0)
1148 error ("Can't convert to unibyte character: %d", XINT (ch));
1149 return make_number (c);
1152 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1153 "Return 1 regardless of the argument CHAR.\n\
1154 This is now an obsolete function. We keep it just for backward compatibility.")
1155 (ch)
1156 Lisp_Object ch;
1158 Lisp_Object val;
1160 CHECK_NUMBER (ch, 0);
1161 return make_number (1);
1164 /* Return how many bytes C will occupy in a multibyte buffer.
1165 Don't call this function directly, instead use macro CHAR_BYTES. */
1167 char_bytes (c)
1168 int c;
1170 int charset;
1172 if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1)))
1173 return 1;
1174 if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1175 return 1;
1177 charset = CHAR_CHARSET (c);
1178 return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1181 /* Return the width of character of which multi-byte form starts with
1182 C. The width is measured by how many columns occupied on the
1183 screen when displayed in the current buffer. */
1185 #define ONE_BYTE_CHAR_WIDTH(c) \
1186 (c < 0x20 \
1187 ? (c == '\t' \
1188 ? XFASTINT (current_buffer->tab_width) \
1189 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1190 : (c < 0x7f \
1191 ? 1 \
1192 : (c == 0x7F \
1193 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1194 : ((! NILP (current_buffer->enable_multibyte_characters) \
1195 && BASE_LEADING_CODE_P (c)) \
1196 ? WIDTH_BY_CHAR_HEAD (c) \
1197 : 4))))
1199 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1200 "Return width of CHAR when displayed in the current buffer.\n\
1201 The width is measured by how many columns it occupies on the screen.\n\
1202 Tab is taken to occupy `tab-width' columns.")
1203 (ch)
1204 Lisp_Object ch;
1206 Lisp_Object val, disp;
1207 int c;
1208 struct Lisp_Char_Table *dp = buffer_display_table ();
1210 CHECK_NUMBER (ch, 0);
1212 c = XINT (ch);
1214 /* Get the way the display table would display it. */
1215 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1217 if (VECTORP (disp))
1218 XSETINT (val, XVECTOR (disp)->size);
1219 else if (SINGLE_BYTE_CHAR_P (c))
1220 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1221 else
1223 int charset = CHAR_CHARSET (c);
1225 XSETFASTINT (val, CHARSET_WIDTH (charset));
1227 return val;
1230 /* Return width of string STR of length LEN when displayed in the
1231 current buffer. The width is measured by how many columns it
1232 occupies on the screen. */
1235 strwidth (str, len)
1236 unsigned char *str;
1237 int len;
1239 unsigned char *endp = str + len;
1240 int width = 0;
1241 struct Lisp_Char_Table *dp = buffer_display_table ();
1243 while (str < endp)
1245 Lisp_Object disp;
1246 int thislen;
1247 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1249 /* Get the way the display table would display it. */
1250 if (dp)
1251 disp = DISP_CHAR_VECTOR (dp, c);
1252 else
1253 disp = Qnil;
1255 if (VECTORP (disp))
1256 width += XVECTOR (disp)->size;
1257 else
1258 width += ONE_BYTE_CHAR_WIDTH (*str);
1260 str += thislen;
1262 return width;
1266 lisp_string_width (str)
1267 Lisp_Object str;
1269 int len = XSTRING (str)->size, len_byte = STRING_BYTES (XSTRING (str));
1270 int i = 0, i_byte;
1271 int width = 0;
1272 int start, end, start_byte;
1273 Lisp_Object prop;
1274 int cmp_id;
1276 while (i < len)
1278 if (find_composition (i, len, &start, &end, &prop, str))
1280 start_byte = string_char_to_byte (str, start);
1281 if (i < start)
1283 i_byte = string_char_to_byte (str, i);
1284 width += strwidth (XSTRING (str)->data + i_byte,
1285 start_byte - i_byte);
1287 cmp_id
1288 = get_composition_id (start, start_byte, end - start, prop, str);
1289 if (cmp_id >= 0)
1290 width += composition_table[cmp_id]->width;
1291 i = end;
1293 else
1295 i_byte = string_char_to_byte (str, i);
1296 width += strwidth (XSTRING (str)->data + i_byte, len_byte - i_byte);
1297 i = len;
1300 return width;
1303 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1304 "Return width of STRING when displayed in the current buffer.\n\
1305 Width is measured by how many columns it occupies on the screen.\n\
1306 When calculating width of a multibyte character in STRING,\n\
1307 only the base leading-code is considered; the validity of\n\
1308 the following bytes is not checked. Tabs in STRING are always\n\
1309 taken to occupy `tab-width' columns.")
1310 (str)
1311 Lisp_Object str;
1313 Lisp_Object val;
1315 CHECK_STRING (str, 0);
1316 XSETFASTINT (val, lisp_string_width (str));
1317 return val;
1320 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1321 "Return the direction of CHAR.\n\
1322 The returned value is 0 for left-to-right and 1 for right-to-left.")
1323 (ch)
1324 Lisp_Object ch;
1326 int charset;
1328 CHECK_NUMBER (ch, 0);
1329 charset = CHAR_CHARSET (XFASTINT (ch));
1330 if (!CHARSET_DEFINED_P (charset))
1331 invalid_character (XINT (ch));
1332 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1335 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1336 "Return number of characters between BEG and END.")
1337 (beg, end)
1338 Lisp_Object beg, end;
1340 int from, to;
1342 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1343 CHECK_NUMBER_COERCE_MARKER (end, 1);
1345 from = min (XFASTINT (beg), XFASTINT (end));
1346 to = max (XFASTINT (beg), XFASTINT (end));
1348 return make_number (to - from);
1351 /* Return the number of characters in the NBYTES bytes at PTR.
1352 This works by looking at the contents and checking for multibyte sequences.
1353 However, if the current buffer has enable-multibyte-characters = nil,
1354 we treat each byte as a character. */
1357 chars_in_text (ptr, nbytes)
1358 unsigned char *ptr;
1359 int nbytes;
1361 /* current_buffer is null at early stages of Emacs initialization. */
1362 if (current_buffer == 0
1363 || NILP (current_buffer->enable_multibyte_characters))
1364 return nbytes;
1366 return multibyte_chars_in_text (ptr, nbytes);
1369 /* Return the number of characters in the NBYTES bytes at PTR.
1370 This works by looking at the contents and checking for multibyte sequences.
1371 It ignores enable-multibyte-characters. */
1374 multibyte_chars_in_text (ptr, nbytes)
1375 unsigned char *ptr;
1376 int nbytes;
1378 unsigned char *endp;
1379 int chars, bytes;
1381 endp = ptr + nbytes;
1382 chars = 0;
1384 while (ptr < endp)
1386 PARSE_MULTIBYTE_SEQ (ptr, endp - ptr, bytes);
1387 ptr += bytes;
1388 chars++;
1391 return chars;
1394 /* Parse unibyte text at STR of LEN bytes as a multibyte text, and
1395 count the numbers of characters and bytes in it. On counting
1396 bytes, pay attention to that 8-bit characters in the range
1397 0x80..0x9F are represented by 2-byte in a multibyte text. */
1398 void
1399 parse_str_as_multibyte (str, len, nchars, nbytes)
1400 unsigned char *str;
1401 int len, *nchars, *nbytes;
1403 unsigned char *endp = str + len;
1404 int n, chars = 0, bytes = 0;
1406 while (str < endp)
1408 if (UNIBYTE_STR_AS_MULTIBYTE_P (str, endp - str, n))
1409 str += n, bytes += n;
1410 else
1411 str++, bytes += 2;
1412 chars++;
1414 *nchars = chars;
1415 *nbytes = bytes;
1416 return;
1419 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
1420 It actually converts only 8-bit characters in the range 0x80..0x9F
1421 that don't contruct multibyte characters to multibyte forms. If
1422 NCHARS is nonzero, set *NCHARS to the number of characters in the
1423 text. It is assured that we can use LEN bytes at STR as a work
1424 area and that is enough. Return the number of bytes of the
1425 resulting text. */
1428 str_as_multibyte (str, len, nbytes, nchars)
1429 unsigned char *str;
1430 int len, nbytes, *nchars;
1432 unsigned char *p = str, *endp = str + nbytes;
1433 unsigned char *to;
1434 int chars = 0;
1435 int n;
1437 while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1438 p += n, chars++;
1439 if (nchars)
1440 *nchars = chars;
1441 if (p == endp)
1442 return nbytes;
1444 to = p;
1445 nbytes = endp - p;
1446 endp = str + len;
1447 safe_bcopy (p, endp - nbytes, nbytes);
1448 p = endp - nbytes;
1449 while (p < endp)
1451 if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1453 while (n--)
1454 *to++ = *p++;
1456 else
1458 *to++ = LEADING_CODE_8_BIT_CONTROL;
1459 *to++ = *p++ + 0x20;
1461 chars++;
1463 if (nchars)
1464 *nchars = chars;
1465 return (to - str);
1468 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
1469 that contains the same single-byte characters. It actually
1470 converts all 8-bit characters to multibyte forms. It is assured
1471 that we can use LEN bytes at STR as a work area and that is
1472 enough. */
1475 str_to_multibyte (str, len, bytes)
1476 unsigned char *str;
1477 int len, bytes;
1479 unsigned char *p = str, *endp = str + bytes;
1480 unsigned char *to;
1481 int c;
1483 while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++;
1484 if (p == endp)
1485 return bytes;
1486 to = p;
1487 bytes = endp - p;
1488 endp = str + len;
1489 safe_bcopy (p, endp - bytes, bytes);
1490 p = endp - bytes;
1491 while (p < endp)
1493 if (*p < 0x80 || *p >= 0xA0)
1494 *to++ = *p++;
1495 else
1496 *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1498 return (to - str);
1501 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1502 actually converts only 8-bit characters in the range 0x80..0x9F to
1503 unibyte forms. */
1506 str_as_unibyte (str, bytes)
1507 unsigned char *str;
1508 int bytes;
1510 unsigned char *p = str, *endp = str + bytes;
1511 unsigned char *to = str;
1513 while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++;
1514 to = p;
1515 while (p < endp)
1517 if (*p == LEADING_CODE_8_BIT_CONTROL)
1518 *to++ = *(p + 1) - 0x20, p += 2;
1519 else
1520 *to++ = *p++;
1522 return (to - str);
1526 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1527 "Concatenate all the argument characters and make the result a string.")
1528 (n, args)
1529 int n;
1530 Lisp_Object *args;
1532 int i;
1533 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1534 unsigned char *p = buf;
1535 int c;
1537 for (i = 0; i < n; i++)
1539 CHECK_NUMBER (args[i], 0);
1540 c = XINT (args[i]);
1541 p += CHAR_STRING (c, p);
1544 return make_string_from_bytes (buf, n, p - buf);
1547 #endif /* emacs */
1550 charset_id_internal (charset_name)
1551 char *charset_name;
1553 Lisp_Object val;
1555 val= Fget (intern (charset_name), Qcharset);
1556 if (!VECTORP (val))
1557 error ("Charset %s is not defined", charset_name);
1559 return (XINT (XVECTOR (val)->contents[0]));
1562 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1563 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1566 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1567 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1568 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1569 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1570 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1571 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1572 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1573 return Qnil;
1576 void
1577 init_charset_once ()
1579 int i, j, k;
1581 staticpro (&Vcharset_table);
1582 staticpro (&Vcharset_symbol_table);
1583 staticpro (&Vgeneric_character_list);
1585 /* This has to be done here, before we call Fmake_char_table. */
1586 Qcharset_table = intern ("charset-table");
1587 staticpro (&Qcharset_table);
1589 /* Intern this now in case it isn't already done.
1590 Setting this variable twice is harmless.
1591 But don't staticpro it here--that is done in alloc.c. */
1592 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1594 /* Now we are ready to set up this property, so we can
1595 create the charset table. */
1596 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1597 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1599 Qunknown = intern ("unknown");
1600 staticpro (&Qunknown);
1601 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1602 Qunknown);
1604 /* Setup tables. */
1605 for (i = 0; i < 2; i++)
1606 for (j = 0; j < 2; j++)
1607 for (k = 0; k < 128; k++)
1608 iso_charset_table [i][j][k] = -1;
1610 for (i = 0; i < 256; i++)
1611 bytes_by_char_head[i] = 1;
1612 bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1613 bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1614 bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1615 bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1617 for (i = 0; i < 128; i++)
1618 width_by_char_head[i] = 1;
1619 for (; i < 256; i++)
1620 width_by_char_head[i] = 4;
1621 width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1622 width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1623 width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1624 width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1627 Lisp_Object val;
1629 val = Qnil;
1630 for (i = 0x81; i < 0x90; i++)
1631 val = Fcons (make_number ((i - 0x70) << 7), val);
1632 for (; i < 0x9A; i++)
1633 val = Fcons (make_number ((i - 0x8F) << 14), val);
1634 for (i = 0xA0; i < 0xF0; i++)
1635 val = Fcons (make_number ((i - 0x70) << 7), val);
1636 for (; i < 0xFF; i++)
1637 val = Fcons (make_number ((i - 0xE0) << 14), val);
1638 Vgeneric_character_list = Fnreverse (val);
1641 nonascii_insert_offset = 0;
1642 Vnonascii_translation_table = Qnil;
1645 #ifdef emacs
1647 void
1648 syms_of_charset ()
1650 Qcharset = intern ("charset");
1651 staticpro (&Qcharset);
1653 Qascii = intern ("ascii");
1654 staticpro (&Qascii);
1656 Qeight_bit_control = intern ("eight-bit-control");
1657 staticpro (&Qeight_bit_control);
1659 Qeight_bit_graphic = intern ("eight-bit-graphic");
1660 staticpro (&Qeight_bit_graphic);
1662 /* Define special charsets ascii, eight-bit-control, and
1663 eight-bit-graphic. */
1664 update_charset_table (make_number (CHARSET_ASCII),
1665 make_number (1), make_number (94),
1666 make_number (1),
1667 make_number (0),
1668 make_number ('B'),
1669 make_number (0),
1670 build_string ("ASCII"),
1671 Qnil, /* same as above */
1672 build_string ("ASCII (ISO646 IRV)"));
1673 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1674 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1676 update_charset_table (make_number (CHARSET_8_BIT_CONTROL),
1677 make_number (1), make_number (96),
1678 make_number (4),
1679 make_number (0),
1680 make_number (-1),
1681 make_number (-1),
1682 build_string ("8-bit control code (0x80..0x9F)"),
1683 Qnil, /* same as above */
1684 Qnil); /* same as above */
1685 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control;
1686 Fput (Qeight_bit_control, Qcharset,
1687 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL));
1689 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC),
1690 make_number (1), make_number (96),
1691 make_number (4),
1692 make_number (0),
1693 make_number (-1),
1694 make_number (-1),
1695 build_string ("8-bit graphic char (0xA0..0xFF)"),
1696 Qnil, /* same as above */
1697 Qnil); /* same as above */
1698 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1699 Fput (Qeight_bit_graphic, Qcharset,
1700 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1702 Qauto_fill_chars = intern ("auto-fill-chars");
1703 staticpro (&Qauto_fill_chars);
1704 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1706 defsubr (&Sdefine_charset);
1707 defsubr (&Sgeneric_character_list);
1708 defsubr (&Sget_unused_iso_final_char);
1709 defsubr (&Sdeclare_equiv_charset);
1710 defsubr (&Sfind_charset_region);
1711 defsubr (&Sfind_charset_string);
1712 defsubr (&Smake_char_internal);
1713 defsubr (&Ssplit_char);
1714 defsubr (&Schar_charset);
1715 defsubr (&Scharset_after);
1716 defsubr (&Siso_charset);
1717 defsubr (&Schar_valid_p);
1718 defsubr (&Sunibyte_char_to_multibyte);
1719 defsubr (&Smultibyte_char_to_unibyte);
1720 defsubr (&Schar_bytes);
1721 defsubr (&Schar_width);
1722 defsubr (&Sstring_width);
1723 defsubr (&Schar_direction);
1724 defsubr (&Schars_in_region);
1725 defsubr (&Sstring);
1726 defsubr (&Ssetup_special_charsets);
1728 DEFVAR_LISP ("charset-list", &Vcharset_list,
1729 "List of charsets ever defined.");
1730 Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control,
1731 Fcons (Qeight_bit_graphic, Qnil)));
1733 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1734 "Vector of cons cell of a symbol and translation table ever defined.\n\
1735 An ID of a translation table is an index of this vector.");
1736 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1738 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1739 "Leading-code of private TYPE9N charset of column-width 1.");
1740 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1742 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1743 "Leading-code of private TYPE9N charset of column-width 2.");
1744 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1746 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1747 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1748 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1750 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1751 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1752 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1754 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1755 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1756 This is used for converting unibyte text to multibyte,\n\
1757 and for inserting character codes specified by number.\n\n\
1758 This serves to convert a Latin-1 or similar 8-bit character code\n\
1759 to the corresponding Emacs multibyte character code.\n\
1760 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1761 for your choice of character set.\n\
1762 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1763 nonascii_insert_offset = 0;
1765 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1766 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1767 This is used for converting unibyte text to multibyte,\n\
1768 and for inserting character codes specified by number.\n\n\
1769 Conversion is performed only when multibyte characters are enabled,\n\
1770 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1771 to the corresponding Emacs character code.\n\n\
1772 If this is nil, `nonascii-insert-offset' is used instead.\n\
1773 See also the docstring of `make-translation-table'.");
1774 Vnonascii_translation_table = Qnil;
1776 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1777 "A char-table for characters which invoke auto-filling.\n\
1778 Such characters have value t in this table.");
1779 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1780 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1781 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1784 #endif /* emacs */