(GZIP_PROG): Renamed from GZIP.
[emacs.git] / src / charset.c
blobb4f84a9e7854e91d259c2dbbb81e5b63c0316189
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
6 This file is part of GNU Emacs.
8 GNU Emacs is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
13 GNU Emacs is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GNU Emacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 Boston, MA 02110-1301, USA. */
23 /* At first, see the document in `charset.h' to understand the code in
24 this file. */
26 #ifdef emacs
27 #include <config.h>
28 #endif
30 #include <stdio.h>
32 #ifdef emacs
34 #include <sys/types.h>
35 #include "lisp.h"
36 #include "buffer.h"
37 #include "charset.h"
38 #include "composite.h"
39 #include "coding.h"
40 #include "disptab.h"
42 #else /* not emacs */
44 #include "mulelib.h"
46 #endif /* emacs */
48 Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
49 Lisp_Object Qunknown;
51 /* Declaration of special leading-codes. */
52 EMACS_INT leading_code_private_11; /* for private DIMENSION1 of 1-column */
53 EMACS_INT leading_code_private_12; /* for private DIMENSION1 of 2-column */
54 EMACS_INT leading_code_private_21; /* for private DIMENSION2 of 1-column */
55 EMACS_INT leading_code_private_22; /* for private DIMENSION2 of 2-column */
57 /* Declaration of special charsets. The values are set by
58 Fsetup_special_charsets. */
59 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
60 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
61 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
62 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
63 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
64 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
65 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
66 int charset_mule_unicode_0100_24ff;
67 int charset_mule_unicode_2500_33ff;
68 int charset_mule_unicode_e000_ffff;
70 Lisp_Object Qcharset_table;
72 /* A char-table containing information of each character set. */
73 Lisp_Object Vcharset_table;
75 /* A vector of charset symbol indexed by charset-id. This is used
76 only for returning charset symbol from C functions. */
77 Lisp_Object Vcharset_symbol_table;
79 /* A list of charset symbols ever defined. */
80 Lisp_Object Vcharset_list;
82 /* Vector of translation table ever defined.
83 ID of a translation table is used to index this vector. */
84 Lisp_Object Vtranslation_table_vector;
86 /* A char-table for characters which may invoke auto-filling. */
87 Lisp_Object Vauto_fill_chars;
89 Lisp_Object Qauto_fill_chars;
91 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
92 int bytes_by_char_head[256];
93 int width_by_char_head[256];
95 /* Mapping table from ISO2022's charset (specified by DIMENSION,
96 CHARS, and FINAL-CHAR) to Emacs' charset. */
97 int iso_charset_table[2][2][128];
99 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
100 unsigned char *_fetch_multibyte_char_p;
101 int _fetch_multibyte_char_len;
103 /* Offset to add to a non-ASCII value when inserting it. */
104 EMACS_INT nonascii_insert_offset;
106 /* Translation table for converting non-ASCII unibyte characters
107 to multibyte codes, or nil. */
108 Lisp_Object Vnonascii_translation_table;
110 /* List of all possible generic characters. */
111 Lisp_Object Vgeneric_character_list;
114 void
115 invalid_character (c)
116 int c;
118 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
121 /* Parse string STR of length LENGTH and fetch information of a
122 character at STR. Set BYTES to the byte length the character
123 occupies, CHARSET, C1, C2 to proper values of the character. */
125 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
126 do { \
127 (c1) = *(str); \
128 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
129 if ((bytes) == 1) \
130 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
131 else if ((bytes) == 2) \
133 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
134 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
135 else \
136 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
138 else if ((bytes) == 3) \
140 if ((c1) < LEADING_CODE_PRIVATE_11) \
141 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
142 else \
143 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
145 else \
146 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
147 } while (0)
149 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.
150 Note that this intentionally allows invalid components, such
151 as 0xA0 0xA0, because there exist many files that contain
152 such invalid byte sequences, especially in EUC-GB. */
153 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
154 ((charset) == CHARSET_ASCII \
155 ? ((c1) >= 0 && (c1) <= 0x7F) \
156 : ((charset) == CHARSET_8_BIT_CONTROL \
157 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
158 : ((charset) == CHARSET_8_BIT_GRAPHIC \
159 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
160 : (CHARSET_DIMENSION (charset) == 1 \
161 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
162 : ((c1) >= 0x20 && (c1) <= 0x7F \
163 && (c2) >= 0x20 && (c2) <= 0x7F)))))
165 /* Store multi-byte form of the character C in STR. The caller should
166 allocate at least 4-byte area at STR in advance. Returns the
167 length of the multi-byte form. If C is an invalid character code,
168 return -1. */
171 char_to_string_1 (c, str)
172 int c;
173 unsigned char *str;
175 unsigned char *p = str;
177 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
179 /* Multibyte character can't have a modifier bit. */
180 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
181 return -1;
183 /* For Meta, Shift, and Control modifiers, we need special care. */
184 if (c & CHAR_META)
186 /* Move the meta bit to the right place for a string. */
187 c = (c & ~CHAR_META) | 0x80;
189 if (c & CHAR_SHIFT)
191 /* Shift modifier is valid only with [A-Za-z]. */
192 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
193 c &= ~CHAR_SHIFT;
194 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
195 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
197 if (c & CHAR_CTL)
199 /* Simulate the code in lread.c. */
200 /* Allow `\C- ' and `\C-?'. */
201 if (c == (CHAR_CTL | ' '))
202 c = 0;
203 else if (c == (CHAR_CTL | '?'))
204 c = 127;
205 /* ASCII control chars are made from letters (both cases),
206 as well as the non-letters within 0100...0137. */
207 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
208 c &= (037 | (~0177 & ~CHAR_CTL));
209 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
210 c &= (037 | (~0177 & ~CHAR_CTL));
213 /* If C still has any modifier bits, just ignore it. */
214 c &= ~CHAR_MODIFIER_MASK;
217 if (SINGLE_BYTE_CHAR_P (c))
219 if (ASCII_BYTE_P (c) || c >= 0xA0)
220 *p++ = c;
221 else
223 *p++ = LEADING_CODE_8_BIT_CONTROL;
224 *p++ = c + 0x20;
227 else if (CHAR_VALID_P (c, 0))
229 int charset, c1, c2;
231 SPLIT_CHAR (c, charset, c1, c2);
233 if (charset >= LEADING_CODE_EXT_11)
234 *p++ = (charset < LEADING_CODE_EXT_12
235 ? LEADING_CODE_PRIVATE_11
236 : (charset < LEADING_CODE_EXT_21
237 ? LEADING_CODE_PRIVATE_12
238 : (charset < LEADING_CODE_EXT_22
239 ? LEADING_CODE_PRIVATE_21
240 : LEADING_CODE_PRIVATE_22)));
241 *p++ = charset;
242 if ((c1 > 0 && c1 < 32) || (c2 > 0 && c2 < 32))
243 return -1;
244 if (c1)
246 *p++ = c1 | 0x80;
247 if (c2 > 0)
248 *p++ = c2 | 0x80;
251 else
252 return -1;
254 return (p - str);
258 /* Store multi-byte form of the character C in STR. The caller should
259 allocate at least 4-byte area at STR in advance. Returns the
260 length of the multi-byte form. If C is an invalid character code,
261 signal an error.
263 Use macro `CHAR_STRING (C, STR)' instead of calling this function
264 directly if C can be an ASCII character. */
267 char_to_string (c, str)
268 int c;
269 unsigned char *str;
271 int len;
272 len = char_to_string_1 (c, str);
273 if (len == -1)
274 invalid_character (c);
275 return len;
279 /* Return the non-ASCII character corresponding to multi-byte form at
280 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
281 length of the multibyte form in *ACTUAL_LEN.
283 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
284 this function directly if you want ot handle ASCII characters as
285 well. */
288 string_to_char (str, len, actual_len)
289 const unsigned char *str;
290 int len, *actual_len;
292 int c, bytes, charset, c1, c2;
294 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
295 c = MAKE_CHAR (charset, c1, c2);
296 if (actual_len)
297 *actual_len = bytes;
298 return c;
301 /* Return the length of the multi-byte form at string STR of length LEN.
302 Use the macro MULTIBYTE_FORM_LENGTH instead. */
304 multibyte_form_length (str, len)
305 const unsigned char *str;
306 int len;
308 int bytes;
310 PARSE_MULTIBYTE_SEQ (str, len, bytes);
311 return bytes;
314 /* Check multibyte form at string STR of length LEN and set variables
315 pointed by CHARSET, C1, and C2 to charset and position codes of the
316 character at STR, and return 0. If there's no multibyte character,
317 return -1. This should be used only in the macro SPLIT_STRING
318 which checks range of STR in advance. */
321 split_string (str, len, charset, c1, c2)
322 const unsigned char *str;
323 unsigned char *c1, *c2;
324 int len, *charset;
326 register int bytes, cs, code1, code2 = -1;
328 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
329 if (cs == CHARSET_ASCII)
330 return -1;
331 *charset = cs;
332 *c1 = code1;
333 *c2 = code2;
334 return 0;
337 /* Return 1 iff character C has valid printable glyph.
338 Use the macro CHAR_PRINTABLE_P instead. */
340 char_printable_p (c)
341 int c;
343 int charset, c1, c2;
345 if (ASCII_BYTE_P (c))
346 return 1;
347 else if (SINGLE_BYTE_CHAR_P (c))
348 return 0;
349 else if (c >= MAX_CHAR)
350 return 0;
352 SPLIT_CHAR (c, charset, c1, c2);
353 if (! CHARSET_DEFINED_P (charset))
354 return 0;
355 if (CHARSET_CHARS (charset) == 94
356 ? c1 <= 32 || c1 >= 127
357 : c1 < 32)
358 return 0;
359 if (CHARSET_DIMENSION (charset) == 2
360 && (CHARSET_CHARS (charset) == 94
361 ? c2 <= 32 || c2 >= 127
362 : c2 < 32))
363 return 0;
364 return 1;
367 /* Translate character C by translation table TABLE. If C
368 is negative, translate a character specified by CHARSET, C1, and C2
369 (C1 and C2 are code points of the character). If no translation is
370 found in TABLE, return C. */
372 translate_char (table, c, charset, c1, c2)
373 Lisp_Object table;
374 int c, charset, c1, c2;
376 Lisp_Object ch;
377 int alt_charset, alt_c1, alt_c2, dimension;
379 if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
380 if (!CHAR_TABLE_P (table)
381 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
382 return c;
384 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
385 dimension = CHARSET_DIMENSION (alt_charset);
386 if ((dimension == 1 && alt_c1 > 0) || (dimension == 2 && alt_c2 > 0))
387 /* CH is not a generic character, just return it. */
388 return XFASTINT (ch);
390 /* Since CH is a generic character, we must return a specific
391 charater which has the same position codes as C from CH. */
392 if (charset < 0)
393 SPLIT_CHAR (c, charset, c1, c2);
394 if (dimension != CHARSET_DIMENSION (charset))
395 /* We can't make such a character because of dimension mismatch. */
396 return c;
397 return MAKE_CHAR (alt_charset, c1, c2);
400 /* Convert the unibyte character C to multibyte based on
401 Vnonascii_translation_table or nonascii_insert_offset. If they can't
402 convert C to a valid multibyte character, convert it based on
403 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
406 unibyte_char_to_multibyte (c)
407 int c;
409 if (c < 0400 && c >= 0200)
411 int c_save = c;
413 if (! NILP (Vnonascii_translation_table))
415 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
416 if (c >= 0400 && ! char_valid_p (c, 0))
417 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
419 else if (c >= 0240 && nonascii_insert_offset > 0)
421 c += nonascii_insert_offset;
422 if (c < 0400 || ! char_valid_p (c, 0))
423 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
425 else if (c >= 0240)
426 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
428 return c;
432 /* Convert the multibyte character C to unibyte 8-bit character based
433 on Vnonascii_translation_table or nonascii_insert_offset. If
434 REV_TBL is non-nil, it should be a reverse table of
435 Vnonascii_translation_table, i.e. what given by:
436 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
439 multibyte_char_to_unibyte (c, rev_tbl)
440 int c;
441 Lisp_Object rev_tbl;
443 if (!SINGLE_BYTE_CHAR_P (c))
445 int c_save = c;
447 if (! CHAR_TABLE_P (rev_tbl)
448 && CHAR_TABLE_P (Vnonascii_translation_table))
449 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
450 make_number (0));
451 if (CHAR_TABLE_P (rev_tbl))
453 Lisp_Object temp;
454 temp = Faref (rev_tbl, make_number (c));
455 if (INTEGERP (temp))
456 c = XINT (temp);
457 if (c >= 256)
458 c = (c_save & 0177) + 0200;
460 else
462 if (nonascii_insert_offset > 0)
463 c -= nonascii_insert_offset;
464 if (c < 128 || c >= 256)
465 c = (c_save & 0177) + 0200;
469 return c;
473 /* Update the table Vcharset_table with the given arguments (see the
474 document of `define-charset' for the meaning of each argument).
475 Several other table contents are also updated. The caller should
476 check the validity of CHARSET-ID and the remaining arguments in
477 advance. */
479 void
480 update_charset_table (charset_id, dimension, chars, width, direction,
481 iso_final_char, iso_graphic_plane,
482 short_name, long_name, description)
483 Lisp_Object charset_id, dimension, chars, width, direction;
484 Lisp_Object iso_final_char, iso_graphic_plane;
485 Lisp_Object short_name, long_name, description;
487 int charset = XINT (charset_id);
488 int bytes;
489 unsigned char leading_code_base, leading_code_ext;
491 if (NILP (CHARSET_TABLE_ENTRY (charset)))
492 CHARSET_TABLE_ENTRY (charset)
493 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
495 if (NILP (long_name))
496 long_name = short_name;
497 if (NILP (description))
498 description = long_name;
500 /* Get byte length of multibyte form, base leading-code, and
501 extended leading-code of the charset. See the comment under the
502 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
503 bytes = XINT (dimension);
504 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
506 /* Official charset, it doesn't have an extended leading-code. */
507 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
508 bytes += 1; /* For a base leading-code. */
509 leading_code_base = charset;
510 leading_code_ext = 0;
512 else
514 /* Private charset. */
515 bytes += 2; /* For base and extended leading-codes. */
516 leading_code_base
517 = (charset < LEADING_CODE_EXT_12
518 ? LEADING_CODE_PRIVATE_11
519 : (charset < LEADING_CODE_EXT_21
520 ? LEADING_CODE_PRIVATE_12
521 : (charset < LEADING_CODE_EXT_22
522 ? LEADING_CODE_PRIVATE_21
523 : LEADING_CODE_PRIVATE_22)));
524 leading_code_ext = charset;
525 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
526 error ("Invalid dimension for the charset-ID %d", charset);
529 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
530 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
531 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
532 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
533 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
534 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
535 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
536 = make_number (leading_code_base);
537 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
538 = make_number (leading_code_ext);
539 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
540 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
541 = iso_graphic_plane;
542 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
543 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
544 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
545 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
548 /* If we have already defined a charset which has the same
549 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
550 DIRECTION, we must update the entry REVERSE-CHARSET of both
551 charsets. If there's no such charset, the value of the entry
552 is set to nil. */
553 int i;
555 for (i = 0; i <= MAX_CHARSET; i++)
556 if (!NILP (CHARSET_TABLE_ENTRY (i)))
558 if (CHARSET_DIMENSION (i) == XINT (dimension)
559 && CHARSET_CHARS (i) == XINT (chars)
560 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
561 && CHARSET_DIRECTION (i) != XINT (direction))
563 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
564 = make_number (i);
565 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
566 break;
569 if (i > MAX_CHARSET)
570 /* No such a charset. */
571 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
572 = make_number (-1);
575 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
576 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
578 bytes_by_char_head[leading_code_base] = bytes;
579 width_by_char_head[leading_code_base] = XINT (width);
581 /* Update table emacs_code_class. */
582 emacs_code_class[charset] = (bytes == 2
583 ? EMACS_leading_code_2
584 : (bytes == 3
585 ? EMACS_leading_code_3
586 : EMACS_leading_code_4));
589 /* Update table iso_charset_table. */
590 if (XINT (iso_final_char) >= 0
591 && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
592 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
595 #ifdef emacs
597 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
598 is invalid. */
600 get_charset_id (charset_symbol)
601 Lisp_Object charset_symbol;
603 Lisp_Object val;
604 int charset;
606 /* This originally used a ?: operator, but reportedly the HP-UX
607 compiler version HP92453-01 A.10.32.22 miscompiles that. */
608 if (SYMBOLP (charset_symbol)
609 && VECTORP (val = Fget (charset_symbol, Qcharset))
610 && CHARSET_VALID_P (charset =
611 XINT (XVECTOR (val)->contents[CHARSET_ID_IDX])))
612 return charset;
613 else
614 return -1;
617 /* Return an identification number for a new private charset of
618 DIMENSION and WIDTH. If there's no more room for the new charset,
619 return 0. */
620 Lisp_Object
621 get_new_private_charset_id (dimension, width)
622 int dimension, width;
624 int charset, from, to;
626 if (dimension == 1)
628 from = LEADING_CODE_EXT_11;
629 to = LEADING_CODE_EXT_21;
631 else
633 from = LEADING_CODE_EXT_21;
634 to = LEADING_CODE_EXT_MAX + 1;
637 for (charset = from; charset < to; charset++)
638 if (!CHARSET_DEFINED_P (charset)) break;
640 return make_number (charset < to ? charset : 0);
643 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
644 doc: /* Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.
645 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is
646 treated as a private charset.
647 INFO-VECTOR is a vector of the format:
648 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE
649 SHORT-NAME LONG-NAME DESCRIPTION]
650 The meanings of each elements is as follows:
651 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.
652 CHARS (integer) is the number of characters in a dimension: 94 or 96.
653 WIDTH (integer) is the number of columns a character in the charset
654 occupies on the screen: one of 0, 1, and 2.
656 DIRECTION (integer) is the rendering direction of characters in the
657 charset when rendering. If 0, render from left to right, else
658 render from right to left.
660 ISO-FINAL-CHAR (character) is the final character of the
661 corresponding ISO 2022 charset.
662 It may be -1 if the charset is internal use only.
664 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked
665 while encoding to variants of ISO 2022 coding system, one of the
666 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).
667 It may be -1 if the charset is internal use only.
669 SHORT-NAME (string) is the short name to refer to the charset.
671 LONG-NAME (string) is the long name to refer to the charset.
673 DESCRIPTION (string) is the description string of the charset. */)
674 (charset_id, charset_symbol, info_vector)
675 Lisp_Object charset_id, charset_symbol, info_vector;
677 Lisp_Object *vec;
679 if (!NILP (charset_id))
680 CHECK_NUMBER (charset_id);
681 CHECK_SYMBOL (charset_symbol);
682 CHECK_VECTOR (info_vector);
684 if (! NILP (charset_id))
686 if (! CHARSET_VALID_P (XINT (charset_id)))
687 error ("Invalid CHARSET: %d", XINT (charset_id));
688 else if (CHARSET_DEFINED_P (XINT (charset_id)))
689 error ("Already defined charset: %d", XINT (charset_id));
692 vec = XVECTOR (info_vector)->contents;
693 if (XVECTOR (info_vector)->size != 9
694 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
695 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
696 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
697 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
698 || !INTEGERP (vec[4])
699 || !(XINT (vec[4]) == -1 || (XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~'))
700 || !INTEGERP (vec[5])
701 || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
702 || !STRINGP (vec[6])
703 || !STRINGP (vec[7])
704 || !STRINGP (vec[8]))
705 error ("Invalid info-vector argument for defining charset %s",
706 SDATA (SYMBOL_NAME (charset_symbol)));
708 if (NILP (charset_id))
710 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
711 if (XINT (charset_id) == 0)
712 error ("There's no room for a new private charset %s",
713 SDATA (SYMBOL_NAME (charset_symbol)));
716 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
717 vec[4], vec[5], vec[6], vec[7], vec[8]);
718 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
719 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
720 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
721 Fupdate_coding_systems_internal ();
722 return Qnil;
725 DEFUN ("generic-character-list", Fgeneric_character_list,
726 Sgeneric_character_list, 0, 0, 0,
727 doc: /* Return a list of all possible generic characters.
728 It includes a generic character for a charset not yet defined. */)
731 return Vgeneric_character_list;
734 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
735 Sget_unused_iso_final_char, 2, 2, 0,
736 doc: /* Return an unused ISO's final char for a charset of DIMENSION and CHARS.
737 DIMENSION is the number of bytes to represent a character: 1 or 2.
738 CHARS is the number of characters in a dimension: 94 or 96.
740 This final char is for private use, thus the range is `0' (48) .. `?' (63).
741 If there's no unused final char for the specified kind of charset,
742 return nil. */)
743 (dimension, chars)
744 Lisp_Object dimension, chars;
746 int final_char;
748 CHECK_NUMBER (dimension);
749 CHECK_NUMBER (chars);
750 if (XINT (dimension) != 1 && XINT (dimension) != 2)
751 error ("Invalid charset dimension %d, it should be 1 or 2",
752 XINT (dimension));
753 if (XINT (chars) != 94 && XINT (chars) != 96)
754 error ("Invalid charset chars %d, it should be 94 or 96",
755 XINT (chars));
756 for (final_char = '0'; final_char <= '?'; final_char++)
758 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
759 break;
761 return (final_char <= '?' ? make_number (final_char) : Qnil);
764 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
765 4, 4, 0,
766 doc: /* Declare an equivalent charset for ISO-2022 decoding.
768 On decoding by an ISO-2022 base coding system, when a charset
769 specified by DIMENSION, CHARS, and FINAL-CHAR is designated, behave as
770 if CHARSET is designated instead. */)
771 (dimension, chars, final_char, charset)
772 Lisp_Object dimension, chars, final_char, charset;
774 int charset_id;
776 CHECK_NUMBER (dimension);
777 CHECK_NUMBER (chars);
778 CHECK_NUMBER (final_char);
779 CHECK_SYMBOL (charset);
781 if (XINT (dimension) != 1 && XINT (dimension) != 2)
782 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
783 if (XINT (chars) != 94 && XINT (chars) != 96)
784 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
785 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
786 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
787 if ((charset_id = get_charset_id (charset)) < 0)
788 error ("Invalid charset %s", SDATA (SYMBOL_NAME (charset)));
790 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset_id;
791 return Qnil;
794 /* Return information about charsets in the text at PTR of NBYTES
795 bytes, which are NCHARS characters. The value is:
797 0: Each character is represented by one byte. This is always
798 true for unibyte text.
799 1: No charsets other than ascii eight-bit-control,
800 eight-bit-graphic, and latin-1 are found.
801 2: Otherwise.
803 In addition, if CHARSETS is nonzero, for each found charset N, set
804 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
805 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
806 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
807 1 (note that there's no charset whose ID is 1). */
810 find_charset_in_text (ptr, nchars, nbytes, charsets, table)
811 const unsigned char *ptr;
812 int nchars, nbytes, *charsets;
813 Lisp_Object table;
815 if (nchars == nbytes)
817 if (charsets && nbytes > 0)
819 const unsigned char *endp = ptr + nbytes;
820 int maskbits = 0;
822 while (ptr < endp && maskbits != 7)
824 maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
825 ptr++;
828 if (maskbits & 1)
829 charsets[CHARSET_ASCII] = 1;
830 if (maskbits & 2)
831 charsets[CHARSET_8_BIT_CONTROL] = 1;
832 if (maskbits & 4)
833 charsets[CHARSET_8_BIT_GRAPHIC] = 1;
835 return 0;
837 else
839 int return_val = 1;
840 int bytes, charset, c1, c2;
842 if (! CHAR_TABLE_P (table))
843 table = Qnil;
845 while (nchars-- > 0)
847 SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
848 ptr += bytes;
850 if (!CHARSET_DEFINED_P (charset))
851 charset = 1;
852 else if (! NILP (table))
854 int c = translate_char (table, -1, charset, c1, c2);
855 if (c >= 0)
856 charset = CHAR_CHARSET (c);
859 if (return_val == 1
860 && charset != CHARSET_ASCII
861 && charset != CHARSET_8_BIT_CONTROL
862 && charset != CHARSET_8_BIT_GRAPHIC
863 && charset != charset_latin_iso8859_1)
864 return_val = 2;
866 if (charsets)
867 charsets[charset] = 1;
868 else if (return_val == 2)
869 break;
871 return return_val;
875 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
876 2, 3, 0,
877 doc: /* Return a list of charsets in the region between BEG and END.
878 BEG and END are buffer positions.
879 Optional arg TABLE if non-nil is a translation table to look up.
881 If the region contains invalid multibyte characters,
882 `unknown' is included in the returned list.
884 If the current buffer is unibyte, the returned list may contain
885 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
886 (beg, end, table)
887 Lisp_Object beg, end, table;
889 int charsets[MAX_CHARSET + 1];
890 int from, from_byte, to, stop, stop_byte, i;
891 Lisp_Object val;
893 validate_region (&beg, &end);
894 from = XFASTINT (beg);
895 stop = to = XFASTINT (end);
897 if (from < GPT && GPT < to)
899 stop = GPT;
900 stop_byte = GPT_BYTE;
902 else
903 stop_byte = CHAR_TO_BYTE (stop);
905 from_byte = CHAR_TO_BYTE (from);
907 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
908 while (1)
910 find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from,
911 stop_byte - from_byte, charsets, table);
912 if (stop < to)
914 from = stop, from_byte = stop_byte;
915 stop = to, stop_byte = CHAR_TO_BYTE (stop);
917 else
918 break;
921 val = Qnil;
922 if (charsets[1])
923 val = Fcons (Qunknown, val);
924 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
925 if (charsets[i])
926 val = Fcons (CHARSET_SYMBOL (i), val);
927 if (charsets[0])
928 val = Fcons (Qascii, val);
929 return val;
932 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
933 1, 2, 0,
934 doc: /* Return a list of charsets in STR.
935 Optional arg TABLE if non-nil is a translation table to look up.
937 If the string contains invalid multibyte characters,
938 `unknown' is included in the returned list.
940 If STR is unibyte, the returned list may contain
941 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
942 (str, table)
943 Lisp_Object str, table;
945 int charsets[MAX_CHARSET + 1];
946 int i;
947 Lisp_Object val;
949 CHECK_STRING (str);
951 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
952 find_charset_in_text (SDATA (str), SCHARS (str),
953 SBYTES (str), charsets, table);
955 val = Qnil;
956 if (charsets[1])
957 val = Fcons (Qunknown, val);
958 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
959 if (charsets[i])
960 val = Fcons (CHARSET_SYMBOL (i), val);
961 if (charsets[0])
962 val = Fcons (Qascii, val);
963 return val;
967 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
968 doc: /* Return a character made from arguments.
969 Internal use only. */)
970 (charset, code1, code2)
971 Lisp_Object charset, code1, code2;
973 int charset_id, c1, c2;
975 CHECK_NUMBER (charset);
976 charset_id = XINT (charset);
977 if (!CHARSET_DEFINED_P (charset_id))
978 error ("Invalid charset ID: %d", XINT (charset));
980 if (NILP (code1))
981 c1 = 0;
982 else
984 CHECK_NUMBER (code1);
985 c1 = XINT (code1);
987 if (NILP (code2))
988 c2 = 0;
989 else
991 CHECK_NUMBER (code2);
992 c2 = XINT (code2);
995 if (charset_id == CHARSET_ASCII)
997 if (c1 < 0 || c1 > 0x7F)
998 goto invalid_code_posints;
999 return make_number (c1);
1001 else if (charset_id == CHARSET_8_BIT_CONTROL)
1003 if (NILP (code1))
1004 c1 = 0x80;
1005 else if (c1 < 0x80 || c1 > 0x9F)
1006 goto invalid_code_posints;
1007 return make_number (c1);
1009 else if (charset_id == CHARSET_8_BIT_GRAPHIC)
1011 if (NILP (code1))
1012 c1 = 0xA0;
1013 else if (c1 < 0xA0 || c1 > 0xFF)
1014 goto invalid_code_posints;
1015 return make_number (c1);
1017 else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
1018 goto invalid_code_posints;
1019 c1 &= 0x7F;
1020 c2 &= 0x7F;
1021 if (c1 == 0
1022 ? c2 != 0
1023 : (c2 == 0
1024 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
1025 : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
1026 goto invalid_code_posints;
1027 return make_number (MAKE_CHAR (charset_id, c1, c2));
1029 invalid_code_posints:
1030 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
1033 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1034 doc: /* Return list of charset and one or two position-codes of CH.
1035 If CH is invalid as a character code,
1036 return a list of symbol `unknown' and CH. */)
1037 (ch)
1038 Lisp_Object ch;
1040 int c, charset, c1, c2;
1042 CHECK_NUMBER (ch);
1043 c = XFASTINT (ch);
1044 if (!CHAR_VALID_P (c, 1))
1045 return Fcons (Qunknown, Fcons (ch, Qnil));
1046 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1047 return (c2 >= 0
1048 ? Fcons (CHARSET_SYMBOL (charset),
1049 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1050 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1053 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1054 doc: /* Return charset of CH. */)
1055 (ch)
1056 Lisp_Object ch;
1058 CHECK_NUMBER (ch);
1060 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1063 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1064 doc: /* Return charset of a character in the current buffer at position POS.
1065 If POS is nil, it defauls to the current point.
1066 If POS is out of range, the value is nil. */)
1067 (pos)
1068 Lisp_Object pos;
1070 Lisp_Object ch;
1071 int charset;
1073 ch = Fchar_after (pos);
1074 if (! INTEGERP (ch))
1075 return ch;
1076 charset = CHAR_CHARSET (XINT (ch));
1077 return CHARSET_SYMBOL (charset);
1080 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1081 doc: /* Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.
1083 ISO 2022's designation sequence (escape sequence) distinguishes charsets
1084 by their DIMENSION, CHARS, and FINAL-CHAR,
1085 where as Emacs distinguishes them by charset symbol.
1086 See the documentation of the function `charset-info' for the meanings of
1087 DIMENSION, CHARS, and FINAL-CHAR. */)
1088 (dimension, chars, final_char)
1089 Lisp_Object dimension, chars, final_char;
1091 int charset;
1093 CHECK_NUMBER (dimension);
1094 CHECK_NUMBER (chars);
1095 CHECK_NUMBER (final_char);
1097 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1098 return Qnil;
1099 return CHARSET_SYMBOL (charset);
1102 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1103 generic character. If GENERICP is zero, return nonzero iff C is a
1104 valid normal character. Do not call this function directly,
1105 instead use macro CHAR_VALID_P. */
1107 char_valid_p (c, genericp)
1108 int c, genericp;
1110 int charset, c1, c2;
1112 if (c < 0 || c >= MAX_CHAR)
1113 return 0;
1114 if (SINGLE_BYTE_CHAR_P (c))
1115 return 1;
1116 SPLIT_CHAR (c, charset, c1, c2);
1117 if (genericp)
1119 if (c1)
1121 if (c2 <= 0) c2 = 0x20;
1123 else
1125 if (c2 <= 0) c1 = c2 = 0x20;
1128 return (CHARSET_DEFINED_P (charset)
1129 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1132 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1133 doc: /* Return t if OBJECT is a valid normal character.
1134 If optional arg GENERICP is non-nil, also return t if OBJECT is
1135 a valid generic character. */)
1136 (object, genericp)
1137 Lisp_Object object, genericp;
1139 if (! NATNUMP (object))
1140 return Qnil;
1141 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1144 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1145 Sunibyte_char_to_multibyte, 1, 1, 0,
1146 doc: /* Convert the unibyte character CH to multibyte character.
1147 The conversion is done based on `nonascii-translation-table' (which see)
1148 or `nonascii-insert-offset' (which see). */)
1149 (ch)
1150 Lisp_Object ch;
1152 int c;
1154 CHECK_NUMBER (ch);
1155 c = XINT (ch);
1156 if (c < 0 || c >= 0400)
1157 error ("Invalid unibyte character: %d", c);
1158 c = unibyte_char_to_multibyte (c);
1159 if (c < 0)
1160 error ("Can't convert to multibyte character: %d", XINT (ch));
1161 return make_number (c);
1164 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1165 Smultibyte_char_to_unibyte, 1, 1, 0,
1166 doc: /* Convert the multibyte character CH to unibyte character.
1167 The conversion is done based on `nonascii-translation-table' (which see)
1168 or `nonascii-insert-offset' (which see). */)
1169 (ch)
1170 Lisp_Object ch;
1172 int c;
1174 CHECK_NUMBER (ch);
1175 c = XINT (ch);
1176 if (! CHAR_VALID_P (c, 0))
1177 error ("Invalid multibyte character: %d", c);
1178 c = multibyte_char_to_unibyte (c, Qnil);
1179 if (c < 0)
1180 error ("Can't convert to unibyte character: %d", XINT (ch));
1181 return make_number (c);
1184 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1185 doc: /* Return 1 regardless of the argument CH. */)
1186 (ch)
1187 Lisp_Object ch;
1189 CHECK_NUMBER (ch);
1190 return make_number (1);
1193 /* Return how many bytes C will occupy in a multibyte buffer.
1194 Don't call this function directly, instead use macro CHAR_BYTES. */
1196 char_bytes (c)
1197 int c;
1199 int charset;
1201 if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1)))
1202 return 1;
1203 if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1204 return 1;
1206 charset = CHAR_CHARSET (c);
1207 return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1210 /* Return the width of character of which multi-byte form starts with
1211 C. The width is measured by how many columns occupied on the
1212 screen when displayed in the current buffer. */
1214 #define ONE_BYTE_CHAR_WIDTH(c) \
1215 (c < 0x20 \
1216 ? (c == '\t' \
1217 ? XFASTINT (current_buffer->tab_width) \
1218 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1219 : (c < 0x7f \
1220 ? 1 \
1221 : (c == 0x7F \
1222 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1223 : ((! NILP (current_buffer->enable_multibyte_characters) \
1224 && BASE_LEADING_CODE_P (c)) \
1225 ? WIDTH_BY_CHAR_HEAD (c) \
1226 : 4))))
1228 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1229 doc: /* Return width of CH when displayed in the current buffer.
1230 The width is measured by how many columns it occupies on the screen.
1231 Tab is taken to occupy `tab-width' columns. */)
1232 (ch)
1233 Lisp_Object ch;
1235 Lisp_Object val, disp;
1236 int c;
1237 struct Lisp_Char_Table *dp = buffer_display_table ();
1239 CHECK_NUMBER (ch);
1241 c = XINT (ch);
1243 /* Get the way the display table would display it. */
1244 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1246 if (VECTORP (disp))
1247 XSETINT (val, XVECTOR (disp)->size);
1248 else if (SINGLE_BYTE_CHAR_P (c))
1249 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1250 else
1252 int charset = CHAR_CHARSET (c);
1254 XSETFASTINT (val, CHARSET_WIDTH (charset));
1256 return val;
1259 /* Return width of string STR of length LEN when displayed in the
1260 current buffer. The width is measured by how many columns it
1261 occupies on the screen. */
1264 strwidth (str, len)
1265 unsigned char *str;
1266 int len;
1268 return c_string_width (str, len, -1, NULL, NULL);
1271 /* Return width of string STR of length LEN when displayed in the
1272 current buffer. The width is measured by how many columns it
1273 occupies on the screen. If PRECISION > 0, return the width of
1274 longest substring that doesn't exceed PRECISION, and set number of
1275 characters and bytes of the substring in *NCHARS and *NBYTES
1276 respectively. */
1279 c_string_width (str, len, precision, nchars, nbytes)
1280 const unsigned char *str;
1281 int len, precision, *nchars, *nbytes;
1283 int i = 0, i_byte = 0;
1284 int width = 0;
1285 int chars;
1286 struct Lisp_Char_Table *dp = buffer_display_table ();
1288 while (i_byte < len)
1290 int bytes, thiswidth;
1291 Lisp_Object val;
1293 if (dp)
1295 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
1297 chars = 1;
1298 val = DISP_CHAR_VECTOR (dp, c);
1299 if (VECTORP (val))
1300 thiswidth = XVECTOR (val)->size;
1301 else
1302 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1304 else
1306 chars = 1;
1307 PARSE_MULTIBYTE_SEQ (str + i_byte, len - i_byte, bytes);
1308 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1311 if (precision > 0
1312 && (width + thiswidth > precision))
1314 *nchars = i;
1315 *nbytes = i_byte;
1316 return width;
1318 i++;
1319 i_byte += bytes;
1320 width += thiswidth;
1323 if (precision > 0)
1325 *nchars = i;
1326 *nbytes = i_byte;
1329 return width;
1332 /* Return width of Lisp string STRING when displayed in the current
1333 buffer. The width is measured by how many columns it occupies on
1334 the screen while paying attention to compositions. If PRECISION >
1335 0, return the width of longest substring that doesn't exceed
1336 PRECISION, and set number of characters and bytes of the substring
1337 in *NCHARS and *NBYTES respectively. */
1340 lisp_string_width (string, precision, nchars, nbytes)
1341 Lisp_Object string;
1342 int precision, *nchars, *nbytes;
1344 int len = SCHARS (string);
1345 int len_byte = SBYTES (string);
1346 const unsigned char *str = SDATA (string);
1347 int i = 0, i_byte = 0;
1348 int width = 0;
1349 struct Lisp_Char_Table *dp = buffer_display_table ();
1351 while (i < len)
1353 int chars, bytes, thiswidth;
1354 Lisp_Object val;
1355 int cmp_id;
1356 int ignore, end;
1358 if (find_composition (i, -1, &ignore, &end, &val, string)
1359 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
1360 >= 0))
1362 thiswidth = composition_table[cmp_id]->width;
1363 chars = end - i;
1364 bytes = string_char_to_byte (string, end) - i_byte;
1366 else if (dp)
1368 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
1370 chars = 1;
1371 val = DISP_CHAR_VECTOR (dp, c);
1372 if (VECTORP (val))
1373 thiswidth = XVECTOR (val)->size;
1374 else
1375 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1377 else
1379 chars = 1;
1380 PARSE_MULTIBYTE_SEQ (str + i_byte, len_byte - i_byte, bytes);
1381 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1384 if (precision > 0
1385 && (width + thiswidth > precision))
1387 *nchars = i;
1388 *nbytes = i_byte;
1389 return width;
1391 i += chars;
1392 i_byte += bytes;
1393 width += thiswidth;
1396 if (precision > 0)
1398 *nchars = i;
1399 *nbytes = i_byte;
1402 return width;
1405 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1406 doc: /* Return width of STRING when displayed in the current buffer.
1407 Width is measured by how many columns it occupies on the screen.
1408 When calculating width of a multibyte character in STRING,
1409 only the base leading-code is considered; the validity of
1410 the following bytes is not checked. Tabs in STRING are always
1411 taken to occupy `tab-width' columns. */)
1412 (string)
1413 Lisp_Object string;
1415 Lisp_Object val;
1417 CHECK_STRING (string);
1418 XSETFASTINT (val, lisp_string_width (string, -1, NULL, NULL));
1419 return val;
1422 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1423 doc: /* Return the direction of CH.
1424 The returned value is 0 for left-to-right and 1 for right-to-left. */)
1425 (ch)
1426 Lisp_Object ch;
1428 int charset;
1430 CHECK_NUMBER (ch);
1431 charset = CHAR_CHARSET (XFASTINT (ch));
1432 if (!CHARSET_DEFINED_P (charset))
1433 invalid_character (XINT (ch));
1434 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1437 /* Return the number of characters in the NBYTES bytes at PTR.
1438 This works by looking at the contents and checking for multibyte sequences.
1439 However, if the current buffer has enable-multibyte-characters = nil,
1440 we treat each byte as a character. */
1443 chars_in_text (ptr, nbytes)
1444 const unsigned char *ptr;
1445 int nbytes;
1447 /* current_buffer is null at early stages of Emacs initialization. */
1448 if (current_buffer == 0
1449 || NILP (current_buffer->enable_multibyte_characters))
1450 return nbytes;
1452 return multibyte_chars_in_text (ptr, nbytes);
1455 /* Return the number of characters in the NBYTES bytes at PTR.
1456 This works by looking at the contents and checking for multibyte sequences.
1457 It ignores enable-multibyte-characters. */
1460 multibyte_chars_in_text (ptr, nbytes)
1461 const unsigned char *ptr;
1462 int nbytes;
1464 const unsigned char *endp;
1465 int chars, bytes;
1467 endp = ptr + nbytes;
1468 chars = 0;
1470 while (ptr < endp)
1472 PARSE_MULTIBYTE_SEQ (ptr, endp - ptr, bytes);
1473 ptr += bytes;
1474 chars++;
1477 return chars;
1480 /* Parse unibyte text at STR of LEN bytes as multibyte text, and
1481 count the numbers of characters and bytes in it. On counting
1482 bytes, pay attention to the fact that 8-bit characters in the range
1483 0x80..0x9F are represented by 2 bytes in multibyte text. */
1484 void
1485 parse_str_as_multibyte (str, len, nchars, nbytes)
1486 const unsigned char *str;
1487 int len, *nchars, *nbytes;
1489 const unsigned char *endp = str + len;
1490 int n, chars = 0, bytes = 0;
1492 while (str < endp)
1494 if (UNIBYTE_STR_AS_MULTIBYTE_P (str, endp - str, n))
1495 str += n, bytes += n;
1496 else
1497 str++, bytes += 2;
1498 chars++;
1500 *nchars = chars;
1501 *nbytes = bytes;
1502 return;
1505 /* Arrange unibyte text at STR of NBYTES bytes as multibyte text.
1506 It actually converts only 8-bit characters in the range 0x80..0x9F
1507 that don't contruct multibyte characters to multibyte forms. If
1508 NCHARS is nonzero, set *NCHARS to the number of characters in the
1509 text. It is assured that we can use LEN bytes at STR as a work
1510 area and that is enough. Return the number of bytes of the
1511 resulting text. */
1514 str_as_multibyte (str, len, nbytes, nchars)
1515 unsigned char *str;
1516 int len, nbytes, *nchars;
1518 unsigned char *p = str, *endp = str + nbytes;
1519 unsigned char *to;
1520 int chars = 0;
1521 int n;
1523 while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1524 p += n, chars++;
1525 if (nchars)
1526 *nchars = chars;
1527 if (p == endp)
1528 return nbytes;
1530 to = p;
1531 nbytes = endp - p;
1532 endp = str + len;
1533 safe_bcopy (p, endp - nbytes, nbytes);
1534 p = endp - nbytes;
1535 while (p < endp)
1537 if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1539 while (n--)
1540 *to++ = *p++;
1542 else
1544 *to++ = LEADING_CODE_8_BIT_CONTROL;
1545 *to++ = *p++ + 0x20;
1547 chars++;
1549 if (nchars)
1550 *nchars = chars;
1551 return (to - str);
1554 /* Parse unibyte string at STR of LEN bytes, and return the number of
1555 bytes it may ocupy when converted to multibyte string by
1556 `str_to_multibyte'. */
1559 parse_str_to_multibyte (str, len)
1560 unsigned char *str;
1561 int len;
1563 unsigned char *endp = str + len;
1564 int bytes;
1566 for (bytes = 0; str < endp; str++)
1567 bytes += (*str < 0x80 || *str >= 0xA0) ? 1 : 2;
1568 return bytes;
1571 /* Convert unibyte text at STR of NBYTES bytes to multibyte text
1572 that contains the same single-byte characters. It actually
1573 converts all 8-bit characters to multibyte forms. It is assured
1574 that we can use LEN bytes at STR as a work area and that is
1575 enough. */
1578 str_to_multibyte (str, len, bytes)
1579 unsigned char *str;
1580 int len, bytes;
1582 unsigned char *p = str, *endp = str + bytes;
1583 unsigned char *to;
1585 while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++;
1586 if (p == endp)
1587 return bytes;
1588 to = p;
1589 bytes = endp - p;
1590 endp = str + len;
1591 safe_bcopy (p, endp - bytes, bytes);
1592 p = endp - bytes;
1593 while (p < endp)
1595 if (*p < 0x80 || *p >= 0xA0)
1596 *to++ = *p++;
1597 else
1598 *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1600 return (to - str);
1603 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1604 actually converts only 8-bit characters in the range 0x80..0x9F to
1605 unibyte forms. */
1608 str_as_unibyte (str, bytes)
1609 unsigned char *str;
1610 int bytes;
1612 unsigned char *p = str, *endp = str + bytes;
1613 unsigned char *to = str;
1615 while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++;
1616 to = p;
1617 while (p < endp)
1619 if (*p == LEADING_CODE_8_BIT_CONTROL)
1620 *to++ = *(p + 1) - 0x20, p += 2;
1621 else
1622 *to++ = *p++;
1624 return (to - str);
1628 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
1629 doc: /* Concatenate all the argument characters and make the result a string.
1630 usage: (string &rest CHARACTERS) */)
1631 (n, args)
1632 int n;
1633 Lisp_Object *args;
1635 int i, bufsize;
1636 unsigned char *buf, *p;
1637 int c;
1638 int multibyte = 0;
1639 Lisp_Object ret;
1640 USE_SAFE_ALLOCA;
1642 bufsize = MAX_MULTIBYTE_LENGTH * n;
1643 SAFE_ALLOCA (buf, unsigned char *, bufsize);
1644 p = buf;
1646 for (i = 0; i < n; i++)
1648 CHECK_NUMBER (args[i]);
1649 if (!multibyte && !SINGLE_BYTE_CHAR_P (XFASTINT (args[i])))
1650 multibyte = 1;
1653 for (i = 0; i < n; i++)
1655 c = XINT (args[i]);
1656 if (multibyte)
1657 p += CHAR_STRING (c, p);
1658 else
1659 *p++ = c;
1662 ret = make_string_from_bytes (buf, n, p - buf);
1663 SAFE_FREE ();
1665 return ret;
1668 #endif /* emacs */
1671 charset_id_internal (charset_name)
1672 char *charset_name;
1674 Lisp_Object val;
1676 val= Fget (intern (charset_name), Qcharset);
1677 if (!VECTORP (val))
1678 error ("Charset %s is not defined", charset_name);
1680 return (XINT (XVECTOR (val)->contents[0]));
1683 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1684 Ssetup_special_charsets, 0, 0, 0, doc: /* Internal use only. */)
1687 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1688 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1689 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1690 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1691 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1692 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1693 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1694 charset_mule_unicode_0100_24ff
1695 = charset_id_internal ("mule-unicode-0100-24ff");
1696 charset_mule_unicode_2500_33ff
1697 = charset_id_internal ("mule-unicode-2500-33ff");
1698 charset_mule_unicode_e000_ffff
1699 = charset_id_internal ("mule-unicode-e000-ffff");
1700 return Qnil;
1703 void
1704 init_charset_once ()
1706 int i, j, k;
1708 staticpro (&Vcharset_table);
1709 staticpro (&Vcharset_symbol_table);
1710 staticpro (&Vgeneric_character_list);
1712 /* This has to be done here, before we call Fmake_char_table. */
1713 Qcharset_table = intern ("charset-table");
1714 staticpro (&Qcharset_table);
1716 /* Intern this now in case it isn't already done.
1717 Setting this variable twice is harmless.
1718 But don't staticpro it here--that is done in alloc.c. */
1719 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1721 /* Now we are ready to set up this property, so we can
1722 create the charset table. */
1723 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1724 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1726 Qunknown = intern ("unknown");
1727 staticpro (&Qunknown);
1728 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1729 Qunknown);
1731 /* Setup tables. */
1732 for (i = 0; i < 2; i++)
1733 for (j = 0; j < 2; j++)
1734 for (k = 0; k < 128; k++)
1735 iso_charset_table [i][j][k] = -1;
1737 for (i = 0; i < 256; i++)
1738 bytes_by_char_head[i] = 1;
1739 bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1740 bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1741 bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1742 bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1744 for (i = 0; i < 128; i++)
1745 width_by_char_head[i] = 1;
1746 for (; i < 256; i++)
1747 width_by_char_head[i] = 4;
1748 width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1749 width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1750 width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1751 width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1754 Lisp_Object val;
1756 val = Qnil;
1757 for (i = 0x81; i < 0x90; i++)
1758 val = Fcons (make_number ((i - 0x70) << 7), val);
1759 for (; i < 0x9A; i++)
1760 val = Fcons (make_number ((i - 0x8F) << 14), val);
1761 for (i = 0xA0; i < 0xF0; i++)
1762 val = Fcons (make_number ((i - 0x70) << 7), val);
1763 for (; i < 0xFF; i++)
1764 val = Fcons (make_number ((i - 0xE0) << 14), val);
1765 Vgeneric_character_list = Fnreverse (val);
1768 nonascii_insert_offset = 0;
1769 Vnonascii_translation_table = Qnil;
1772 #ifdef emacs
1774 void
1775 syms_of_charset ()
1777 Qcharset = intern ("charset");
1778 staticpro (&Qcharset);
1780 Qascii = intern ("ascii");
1781 staticpro (&Qascii);
1783 Qeight_bit_control = intern ("eight-bit-control");
1784 staticpro (&Qeight_bit_control);
1786 Qeight_bit_graphic = intern ("eight-bit-graphic");
1787 staticpro (&Qeight_bit_graphic);
1789 /* Define special charsets ascii, eight-bit-control, and
1790 eight-bit-graphic. */
1791 update_charset_table (make_number (CHARSET_ASCII),
1792 make_number (1), make_number (94),
1793 make_number (1),
1794 make_number (0),
1795 make_number ('B'),
1796 make_number (0),
1797 build_string ("ASCII"),
1798 Qnil, /* same as above */
1799 build_string ("ASCII (ISO646 IRV)"));
1800 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1801 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1803 update_charset_table (make_number (CHARSET_8_BIT_CONTROL),
1804 make_number (1), make_number (96),
1805 make_number (4),
1806 make_number (0),
1807 make_number (-1),
1808 make_number (-1),
1809 build_string ("8-bit control code (0x80..0x9F)"),
1810 Qnil, /* same as above */
1811 Qnil); /* same as above */
1812 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control;
1813 Fput (Qeight_bit_control, Qcharset,
1814 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL));
1816 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC),
1817 make_number (1), make_number (96),
1818 make_number (4),
1819 make_number (0),
1820 make_number (-1),
1821 make_number (-1),
1822 build_string ("8-bit graphic char (0xA0..0xFF)"),
1823 Qnil, /* same as above */
1824 Qnil); /* same as above */
1825 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1826 Fput (Qeight_bit_graphic, Qcharset,
1827 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1829 Qauto_fill_chars = intern ("auto-fill-chars");
1830 staticpro (&Qauto_fill_chars);
1831 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1833 defsubr (&Sdefine_charset);
1834 defsubr (&Sgeneric_character_list);
1835 defsubr (&Sget_unused_iso_final_char);
1836 defsubr (&Sdeclare_equiv_charset);
1837 defsubr (&Sfind_charset_region);
1838 defsubr (&Sfind_charset_string);
1839 defsubr (&Smake_char_internal);
1840 defsubr (&Ssplit_char);
1841 defsubr (&Schar_charset);
1842 defsubr (&Scharset_after);
1843 defsubr (&Siso_charset);
1844 defsubr (&Schar_valid_p);
1845 defsubr (&Sunibyte_char_to_multibyte);
1846 defsubr (&Smultibyte_char_to_unibyte);
1847 defsubr (&Schar_bytes);
1848 defsubr (&Schar_width);
1849 defsubr (&Sstring_width);
1850 defsubr (&Schar_direction);
1851 defsubr (&Sstring);
1852 defsubr (&Ssetup_special_charsets);
1854 DEFVAR_LISP ("charset-list", &Vcharset_list,
1855 doc: /* List of charsets ever defined. */);
1856 Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control,
1857 Fcons (Qeight_bit_graphic, Qnil)));
1859 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1860 doc: /* Vector of cons cell of a symbol and translation table ever defined.
1861 An ID of a translation table is an index of this vector. */);
1862 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1864 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1865 doc: /* Leading-code of private TYPE9N charset of column-width 1. */);
1866 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1868 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1869 doc: /* Leading-code of private TYPE9N charset of column-width 2. */);
1870 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1872 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1873 doc: /* Leading-code of private TYPE9Nx9N charset of column-width 1. */);
1874 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1876 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1877 doc: /* Leading-code of private TYPE9Nx9N charset of column-width 2. */);
1878 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1880 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1881 doc: /* Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.
1882 This is used for converting unibyte text to multibyte,
1883 and for inserting character codes specified by number.
1885 This serves to convert a Latin-1 or similar 8-bit character code
1886 to the corresponding Emacs multibyte character code.
1887 Typically the value should be (- (make-char CHARSET 0) 128),
1888 for your choice of character set.
1889 If `nonascii-translation-table' is non-nil, it overrides this variable. */);
1890 nonascii_insert_offset = 0;
1892 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1893 doc: /* Translation table to convert non-ASCII unibyte codes to multibyte.
1894 This is used for converting unibyte text to multibyte,
1895 and for inserting character codes specified by number.
1897 Conversion is performed only when multibyte characters are enabled,
1898 and it serves to convert a Latin-1 or similar 8-bit character code
1899 to the corresponding Emacs character code.
1901 If this is nil, `nonascii-insert-offset' is used instead.
1902 See also the docstring of `make-translation-table'. */);
1903 Vnonascii_translation_table = Qnil;
1905 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1906 doc: /* A char-table for characters which invoke auto-filling.
1907 Such characters have value t in this table. */);
1908 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1909 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1910 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1913 #endif /* emacs */
1915 /* arch-tag: 66a89b8d-4c28-47d3-9ca1-56f78440d69f
1916 (do not change this comment) */