1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /* At first, see the document in `character.h' to understand the code
37 #include <sys/types.h>
39 #include "character.h"
42 #include "composite.h"
51 Lisp_Object Qcharacterp
;
53 /* Vector of translation tables ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector
;
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars
;
60 Lisp_Object Qauto_fill_chars
;
62 Lisp_Object Vchar_unify_table
;
64 /* A char-table. An element is non-nil iff the corresponding
65 character has a printable glyph. */
66 Lisp_Object Vprintable_chars
;
68 /* A char-table. An element is a column-width of the corresponding
70 Lisp_Object Vchar_width_table
;
72 /* A char-table. An element is a symbol indicating the direction
73 property of corresponding character. */
74 Lisp_Object Vchar_direction_table
;
76 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
77 unsigned char *_fetch_multibyte_char_p
;
78 int _fetch_multibyte_char_len
;
/* Store a multibyte form of character C into the bytes at P.

   Branches visible here:
   - C <= MAX_3_BYTE_CHAR or C > MAX_5_BYTE_CHAR: the work is handed
     to CHAR_STRING and its result is kept in BYTES.
   - C <= MAX_4_BYTE_CHAR: four bytes are written, a leading byte
     tagged with 0xF0 followed by three bytes tagged with 0x80.
   - otherwise: bytes P[1]..P[4] are written, each tagged with 0x80.

   NOTE(review): this listing is missing lines (declarations, braces,
   the leading byte of the last branch, and the return statement);
   confirm the details against the complete source.  */
83 char_string_with_unification (c
, p
)
91 if (c
<= MAX_3_BYTE_CHAR
|| c
> MAX_5_BYTE_CHAR
)
93 bytes
= CHAR_STRING (c
, p
);
95 else if (c
<= MAX_4_BYTE_CHAR
)
/* Leading byte: the 0xF0 tag combined with the bits of C from bit
   18 upward.  Each following byte carries six bits of C.  */
97 p
[0] = (0xF0 | (c
>> 18));
98 p
[1] = (0x80 | ((c
>> 12) & 0x3F));
99 p
[2] = (0x80 | ((c
>> 6) & 0x3F));
100 p
[3] = (0x80 | (c
& 0x3F));
/* Remaining case: P[1] holds bits 18..21 of C (mask 0x0F), and
   P[2]..P[4] hold the lower 18 bits, six bits per byte.  */
106 p
[1] = (0x80 | ((c
>> 18) & 0x0F));
107 p
[2] = (0x80 | ((c
>> 12) & 0x3F));
108 p
[3] = (0x80 | ((c
>> 6) & 0x3F));
109 p
[4] = (0x80 | (c
& 0x3F));
/* Decode the character whose multibyte form starts at P, then pass
   the decoded value through MAYBE_UNIFY_CHAR.

   Branches visible here, selected by the first byte *P:
   - *P < 0x80, or bit 0x20 clear, or bit 0x10 clear: decoding is
     done by STRING_CHAR_ADVANCE.
   - bit 0x08 clear: the character is assembled from P[0] (low four
     bits) and the low six bits of the following bytes.
   - otherwise: the character is assembled from the low six bits of
     P[1] and the following bytes; P[0] contributes no bits in the
     visible terms.

   NOTE(review): this listing is missing lines (the remaining
   declarations, braces, the last term of each assembly expression,
   and everything after MAYBE_UNIFY_CHAR), so how ADVANCED, LEN and
   SAVED_P are used cannot be seen here; confirm against the complete
   source.  */
118 string_char_with_unification (p
, advanced
, len
)
119 unsigned char *p
, **advanced
;
/* Remember where the sequence started before P is moved.  */
123 unsigned char *saved_p
= p
;
125 if (*p
< 0x80 || ! (*p
& 0x20) || ! (*p
& 0x10))
127 c
= STRING_CHAR_ADVANCE (p
);
129 else if (! (*p
& 0x08))
/* Bit layout mirrors the 0xF0-tagged form: four bits from the
   leading byte, then six bits per following byte.  */
131 c
= ((((p
)[0] & 0xF) << 18)
132 | (((p
)[1] & 0x3F) << 12)
133 | (((p
)[2] & 0x3F) << 6)
/* Longer form: the payload starts at P[1].  */
139 c
= ((((p
)[1] & 0x3F) << 18)
140 | (((p
)[2] & 0x3F) << 12)
141 | (((p
)[3] & 0x3F) << 6)
146 MAYBE_UNIFY_CHAR (c
);
156 /* Translate character C by translation table TABLE. If C is
157 negative, translate a character specified by CHARSET and CODE. If
158 no translation is found in TABLE, return the untranslated
162 translate_char (table
, c
)
168 if (! CHAR_TABLE_P (table
))
170 ch
= CHAR_TABLE_REF (table
, c
);
171 if (! CHARACTERP (ch
))
176 /* Convert the unibyte character C to the corresponding multibyte
177 character based on the current value of charset_primary. If C
178 can't be converted, return C. */
181 unibyte_char_to_multibyte (c
)
184 struct charset
*charset
= CHARSET_FROM_ID (charset_primary
);
185 int c1
= DECODE_CHAR (charset
, c
);
187 return ((c1
>= 0) ? c1
: c
);
191 /* Convert the multibyte character C to unibyte 8-bit character based
192 on the current value of charset_primary. If dimension of
193 charset_primary is more than one, return (C & 0xFF).
195 The argument REV_TBL is now ignored. It will be removed in the
199 multibyte_char_to_unibyte (c
, rev_tbl
)
203 struct charset
*charset
= CHARSET_FROM_ID (charset_primary
);
204 unsigned c1
= ENCODE_CHAR (charset
, c
);
206 return ((c1
!= CHARSET_INVALID_CODE (charset
)) ? c1
: c
& 0xFF);
210 DEFUN ("characterp", Fcharacterp
, Scharacterp
, 1, 2, 0,
211 doc
: /* Return non-nil if OBJECT is a character. */)
213 Lisp_Object object
, ignore
;
215 return (CHARACTERP (object
) ? Qt
: Qnil
);
218 DEFUN ("max-char", Fmax_char
, Smax_char
, 0, 0, 0,
219 doc
: /* Return the character of the maximum code. */)
222 return make_number (MAX_CHAR
);
225 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
226 Sunibyte_char_to_multibyte
, 1, 1, 0,
227 doc
: /* Convert the unibyte character CH to multibyte character.
228 The multibyte character is a result of decoding CH by
229 the current primary charset (value of `charset-primary'). */)
234 struct charset
*charset
;
236 CHECK_CHARACTER (ch
);
239 error ("Invalid unibyte character: %d", c
);
240 charset
= CHARSET_FROM_ID (charset_primary
);
241 c
= DECODE_CHAR (charset
, c
);
243 error ("Can't convert to multibyte character: %d", XINT (ch
));
244 return make_number (c
);
247 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte
,
248 Smultibyte_char_to_unibyte
, 1, 1, 0,
249 doc
: /* Convert the multibyte character CH to unibyte character.\n\
250 The unibyte character is a result of encoding CH by
251 the current primary charset (value of `charset-primary'). */)
257 struct charset
*charset
;
259 CHECK_CHARACTER (ch
);
261 charset
= CHARSET_FROM_ID (charset_primary
);
262 code
= ENCODE_CHAR (charset
, c
);
263 if (code
< CHARSET_MIN_CODE (charset
)
264 || code
> CHARSET_MAX_CODE (charset
))
265 error ("Can't convert to unibyte character: %d", XINT (ch
));
266 return make_number (code
);
269 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
270 doc
: /* Return 1 regardless of the argument CHAR.
271 This is now an obsolete function. We keep it just for backward compatibility. */)
275 CHECK_CHARACTER (ch
);
276 return make_number (1);
279 DEFUN ("char-width", Fchar_width
, Schar_width
, 1, 1, 0,
280 doc
: /* Return width of CHAR when displayed in the current buffer.
281 The width is measured by how many columns it occupies on the screen.
282 Tab is taken to occupy `tab-width' columns. */)
288 struct Lisp_Char_Table
*dp
= buffer_display_table ();
290 CHECK_CHARACTER (ch
);
293 /* Get the way the display table would display it. */
294 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
297 width
= ASIZE (disp
);
299 width
= CHAR_WIDTH (c
);
301 return make_number (width
);
304 /* Return width of string STR of length LEN when displayed in the
305 current buffer. The width is measured by how many columns it
306 occupies on the screen. If PRECISION > 0, return the width of
307 longest substring that doesn't exceed PRECISION, and set number of
308 characters and bytes of the substring in *NCHARS and *NBYTES
312 c_string_width (str
, len
, precision
, nchars
, nbytes
)
314 int precision
, *nchars
, *nbytes
;
316 int i
= 0, i_byte
= 0;
318 struct Lisp_Char_Table
*dp
= buffer_display_table ();
322 int bytes
, thiswidth
;
324 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
328 val
= DISP_CHAR_VECTOR (dp
, c
);
330 thiswidth
= XVECTOR (val
)->size
;
332 thiswidth
= CHAR_WIDTH (c
);
336 thiswidth
= CHAR_WIDTH (c
);
340 && (width
+ thiswidth
> precision
))
360 /* Return width of string STR of length LEN when displayed in the
361 current buffer. The width is measured by how many columns it
362 occupies on the screen. */
369 return c_string_width (str
, len
, -1, NULL
, NULL
);
372 /* Return width of Lisp string STRING when displayed in the current
373 buffer. The width is measured by how many columns it occupies on
374 the screen while paying attention to compositions. If PRECISION >
375 0, return the width of longest substring that doesn't exceed
376 PRECISION, and set number of characters and bytes of the substring
377 in *NCHARS and *NBYTES respectively. */
380 lisp_string_width (string
, precision
, nchars
, nbytes
)
382 int precision
, *nchars
, *nbytes
;
384 int len
= XSTRING (string
)->size
;
385 unsigned char *str
= XSTRING (string
)->data
;
386 int i
= 0, i_byte
= 0;
388 struct Lisp_Char_Table
*dp
= buffer_display_table ();
392 int chars
, bytes
, thiswidth
;
397 if (find_composition (i
, -1, &ignore
, &end
, &val
, string
)
398 && ((cmp_id
= get_composition_id (i
, i_byte
, end
- i
, val
, string
))
401 thiswidth
= composition_table
[cmp_id
]->width
;
403 bytes
= string_char_to_byte (string
, end
) - i_byte
;
407 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
410 val
= DISP_CHAR_VECTOR (dp
, c
);
412 thiswidth
= XVECTOR (val
)->size
;
414 thiswidth
= CHAR_WIDTH (c
);
418 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
421 thiswidth
= CHAR_WIDTH (c
);
425 && (width
+ thiswidth
> precision
))
445 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
446 doc
: /* Return width of STRING when displayed in the current buffer.
447 Width is measured by how many columns it occupies on the screen.
448 When calculating width of a multibyte character in STRING,
449 only the base leading-code is considered; the validity of
450 the following bytes is not checked. Tabs in STRING are always
451 taken to occupy `tab-width' columns. */)
458 XSETFASTINT (val
, lisp_string_width (str
, -1, NULL
, NULL
));
462 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
463 doc
: /* Return the direction of CHAR.
464 The returned value is 0 for left-to-right and 1 for right-to-left. */)
470 CHECK_CHARACTER (ch
);
472 return CHAR_TABLE_REF (Vchar_direction_table
, c
);
475 DEFUN ("chars-in-region", Fchars_in_region
, Schars_in_region
, 2, 2, 0,
476 doc
: /* Return number of characters between BEG and END.
477 This is now an obsolete function. We keep it just for backward compatibility. */)
479 Lisp_Object beg
, end
;
483 CHECK_NUMBER_COERCE_MARKER (beg
);
484 CHECK_NUMBER_COERCE_MARKER (end
);
486 from
= min (XFASTINT (beg
), XFASTINT (end
));
487 to
= max (XFASTINT (beg
), XFASTINT (end
));
489 return make_number (to
- from
);
492 /* Return the number of characters in the NBYTES bytes at PTR.
493 This works by looking at the contents and checking for multibyte
494 sequences while assuming that there's no invalid sequence.
495 However, if the current buffer has enable-multibyte-characters =
496 nil, we treat each byte as a character. */
499 chars_in_text (ptr
, nbytes
)
503 /* current_buffer is null at early stages of Emacs initialization. */
504 if (current_buffer
== 0
505 || NILP (current_buffer
->enable_multibyte_characters
))
508 return multibyte_chars_in_text (ptr
, nbytes
);
511 /* Return the number of characters in the NBYTES bytes at PTR.
512 This works by looking at the contents and checking for multibyte
513 sequences while assuming that there's no invalid sequence. It
514 ignores enable-multibyte-characters. */
517 multibyte_chars_in_text (ptr
, nbytes
)
521 unsigned char *endp
= ptr
+ nbytes
;
526 int len
= MULTIBYTE_LENGTH (ptr
, endp
);
537 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
538 characters and bytes in it, and store them in *NCHARS and *NBYTES
539 respectively. When counting bytes, note that 8-bit characters
540 that do not form a valid multibyte sequence are represented by
541 2 bytes each in a multibyte text. */
544 parse_str_as_multibyte (str
, len
, nchars
, nbytes
)
546 int len
, *nchars
, *nbytes
;
548 unsigned char *endp
= str
+ len
;
549 int n
, chars
= 0, bytes
= 0;
551 if (len
>= MAX_MULTIBYTE_LENGTH
)
553 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
554 while (str
< adjusted_endp
)
556 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (str
)) > 0)
557 str
+= n
, bytes
+= n
;
565 if ((n
= MULTIBYTE_LENGTH (str
, endp
)) > 0)
566 str
+= n
, bytes
+= n
;
577 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
578 It actually converts only such 8-bit characters that don't construct
579 a multibyte sequence to multibyte forms of Latin-1 characters. If
580 NCHARS is nonzero, set *NCHARS to the number of characters in the
581 text. It is assured that we can use LEN bytes at STR as a work
582 area and that is enough. Return the number of bytes of the
586 str_as_multibyte (str
, len
, nbytes
, nchars
)
588 int len
, nbytes
, *nchars
;
590 unsigned char *p
= str
, *endp
= str
+ nbytes
;
595 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
597 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
598 while (p
< adjusted_endp
599 && (n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
602 while ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
612 safe_bcopy ((char *) p
, (char *) (endp
- nbytes
), nbytes
);
615 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
617 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
618 while (p
< adjusted_endp
)
620 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
628 c
= BYTE8_TO_CHAR (c
);
629 to
+= CHAR_STRING (c
, to
);
636 if ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
644 c
= BYTE8_TO_CHAR (c
);
645 to
+= CHAR_STRING (c
, to
);
654 /* Parse unibyte string at STR of LEN bytes, and return the number of
655 bytes it may occupy when converted to multibyte string by
656 `str_to_multibyte'. */
659 parse_str_to_multibyte (str
, len
)
663 unsigned char *endp
= str
+ len
;
666 for (bytes
= 0; str
< endp
; str
++)
667 bytes
+= (*str
< 0x80) ? 1 : 2;
672 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
673 that contains the same single-byte characters. It actually
674 converts all 8-bit characters to multibyte forms. It is assured
675 that we can use LEN bytes at STR as a work area and that is
679 str_to_multibyte (str
, len
, bytes
)
683 unsigned char *p
= str
, *endp
= str
+ bytes
;
686 while (p
< endp
&& *p
< 0x80) p
++;
692 safe_bcopy ((char *) p
, (char *) (endp
- bytes
), bytes
);
699 c
= BYTE8_TO_CHAR (c
);
700 to
+= CHAR_STRING (c
, to
);
705 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
706 actually converts characters in the range 0x80..0xFF to
710 str_as_unibyte (str
, bytes
)
714 unsigned char *p
= str
, *endp
= str
+ bytes
;
715 unsigned char *to
= str
;
721 len
= BYTES_BY_CHAR_HEAD (c
);
722 if (CHAR_BYTE8_HEAD_P (c
))
730 len
= BYTES_BY_CHAR_HEAD (c
);
731 if (CHAR_BYTE8_HEAD_P (c
))
733 c
= STRING_CHAR_ADVANCE (p
);
734 *to
++ = CHAR_TO_BYTE8 (c
);
738 while (len
--) *to
++ = *p
++;
745 string_count_byte8 (string
)
748 int multibyte
= STRING_MULTIBYTE (string
);
749 int nbytes
= STRING_BYTES (XSTRING (string
));
750 unsigned char *p
= XSTRING (string
)->data
;
751 unsigned char *pend
= p
+ nbytes
;
759 len
= BYTES_BY_CHAR_HEAD (c
);
761 if (CHAR_BYTE8_HEAD_P (c
))
776 string_escape_byte8 (string
)
779 int nchars
= XSTRING (string
)->size
;
780 int nbytes
= STRING_BYTES (XSTRING (string
));
781 int multibyte
= STRING_MULTIBYTE (string
);
783 unsigned char *src
, *src_end
, *dst
;
787 if (multibyte
&& nchars
== nbytes
)
790 byte8_count
= string_count_byte8 (string
);
792 if (byte8_count
== 0)
796 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
797 val
= make_uninit_multibyte_string (nchars
+ byte8_count
* 3,
798 nbytes
+ byte8_count
* 2);
800 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
801 val
= make_uninit_string (nbytes
+ byte8_count
* 3);
803 src
= XSTRING (string
)->data
;
804 src_end
= src
+ nbytes
;
805 dst
= XSTRING (val
)->data
;
807 while (src
< src_end
)
810 len
= BYTES_BY_CHAR_HEAD (c
);
812 if (CHAR_BYTE8_HEAD_P (c
))
814 c
= STRING_CHAR_ADVANCE (src
);
815 c
= CHAR_TO_BYTE8 (c
);
816 sprintf ((char *) dst
, "\\%03o", c
);
820 while (len
--) *dst
++ = *src
++;
823 while (src
< src_end
)
828 sprintf ((char *) dst
, "\\%03o", c
);
838 DEFUN ("string", Fstring
, Sstring
, 1, MANY
, 0,
840 Concatenate all the argument characters and make the result a string.
841 usage: (string &rest CHARACTERS) */)
847 unsigned char *buf
= (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH
* n
);
848 unsigned char *p
= buf
;
851 for (i
= 0; i
< n
; i
++)
853 CHECK_CHARACTER (args
[i
]);
855 p
+= CHAR_STRING (c
, p
);
858 return make_string_from_bytes ((char *) buf
, n
, p
- buf
);
862 init_character_once ()
871 DEFSYM (Qcharacterp
, "characterp");
872 DEFSYM (Qauto_fill_chars
, "auto-fill-chars");
874 staticpro (&Vchar_unify_table
);
875 Vchar_unify_table
= Qnil
;
877 defsubr (&Smax_char
);
878 defsubr (&Scharacterp
);
879 defsubr (&Sunibyte_char_to_multibyte
);
880 defsubr (&Smultibyte_char_to_unibyte
);
881 defsubr (&Schar_bytes
);
882 defsubr (&Schar_width
);
883 defsubr (&Sstring_width
);
884 defsubr (&Schar_direction
);
885 defsubr (&Schars_in_region
);
888 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector
,
890 Vector of cons cell of a symbol and translation table ever defined.
891 An ID of a translation table is an index of this vector. */);
892 Vtranslation_table_vector
= Fmake_vector (make_number (16), Qnil
);
894 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars
,
896 A char-table for characters which invoke auto-filling.
897 Such characters have value t in this table. */);
898 Vauto_fill_chars
= Fmake_char_table (Qauto_fill_chars
, Qnil
);
899 CHAR_TABLE_SET (Vauto_fill_chars
, make_number (' '), Qt
);
900 CHAR_TABLE_SET (Vauto_fill_chars
, make_number ('\n'), Qt
);
902 DEFVAR_LISP ("char-width-table", &Vchar_width_table
,
904 A char-table for width (columns) of each character. */);
905 Vchar_width_table
= Fmake_char_table (Qnil
, make_number (1));
907 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table
,
908 doc
: /* A char-table for direction of each character. */);
909 Vchar_direction_table
= Fmake_char_table (Qnil
, make_number (1));
911 DEFVAR_LISP ("printable-chars", &Vprintable_chars
,
912 doc
: /* A char-table for each printable character. */);
913 Vprintable_chars
= Fmake_char_table (Qnil
, Qt
);