* emacs.texi (Top): Remove Kill Errors from menu.
[emacs.git] / src / character.c
blob7c0f38f96a5180e1901659829a95261824b4f802
1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 Free Software Foundation, Inc.
6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
10 This file is part of GNU Emacs.
12 GNU Emacs is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
17 GNU Emacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
25 /* At first, see the document in `character.h' to understand the code
26 in this file. */
28 #ifdef emacs
29 #include <config.h>
30 #endif
32 #include <stdio.h>
34 #ifdef emacs
36 #include <sys/types.h>
37 #include "lisp.h"
38 #include "character.h"
39 #include "buffer.h"
40 #include "charset.h"
41 #include "composite.h"
42 #include "disptab.h"
44 #else /* not emacs */
46 #include "mulelib.h"
48 #endif /* emacs */
/* Symbol `characterp'; set up by DEFSYM in syms_of_character. */
50 Lisp_Object Qcharacterp;
52 /* Vector of all translation tables ever defined.
53 ID of a translation table is used to index this vector. */
54 Lisp_Object Vtranslation_table_vector;
56 /* A char-table for characters which may invoke auto-filling. */
57 Lisp_Object Vauto_fill_chars;
/* Symbol `auto-fill-chars'; passed to Fmake_char_table when
   Vauto_fill_chars is created in syms_of_character. */
59 Lisp_Object Qauto_fill_chars;
61 /* Char-table of information about which character to unify to which
62 Unicode character. */
63 Lisp_Object Vchar_unify_table;
65 /* A char-table. An element is non-nil iff the corresponding
66 character has a printable glyph. */
67 Lisp_Object Vprintable_chars;
69 /* A char-table. An element is the column-width of the corresponding
70 character. */
71 Lisp_Object Vchar_width_table;
73 /* A char-table. An element is a symbol indicating the direction
74 property of corresponding character. */
75 Lisp_Object Vchar_direction_table;
77 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
78 unsigned char *_fetch_multibyte_char_p;
80 /* Char table of scripts. */
81 Lisp_Object Vchar_script_table;
83 /* Alist of scripts vs representative characters. */
84 Lisp_Object Vscript_representative_chars;
/* Symbol `char-script-table'; set up by DEFSYM in syms_of_character. */
86 static Lisp_Object Qchar_script_table;
/* Value of the Lisp variable `unicode-category-table'; initialized to
   nil in syms_of_character. */
88 Lisp_Object Vunicode_category_table;
90 /* Mapping table from unibyte chars to multibyte chars. */
91 int unibyte_to_multibyte_table[256];
93 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
94 char. */
95 char unibyte_has_multibyte_table[256];
99 /* If character code C has modifier masks, reflect them to the
100 character code if possible. Return the resulting code. */
103 char_resolve_modifier_mask (c)
104 int c;
106 /* A non-ASCII character can't reflect modifier bits to the code. */
107 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
108 return c;
110 /* For Meta, Shift, and Control modifiers, we need special care. */
111 if (c & CHAR_SHIFT)
113 /* Shift modifier is valid only with [A-Za-z]. */
114 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
115 c &= ~CHAR_SHIFT;
116 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
117 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
118 /* Shift modifier for control characters and SPC is ignored. */
119 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
120 c &= ~CHAR_SHIFT;
122 if (c & CHAR_CTL)
124 /* Simulate the code in lread.c. */
125 /* Allow `\C- ' and `\C-?'. */
126 if ((c & 0377) == ' ')
127 c &= ~0177 & ~ CHAR_CTL;
128 else if ((c & 0377) == '?')
129 c = 0177 | (c & ~0177 & ~CHAR_CTL);
130 /* ASCII control chars are made from letters (both cases),
131 as well as the non-letters within 0100...0137. */
132 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
133 c &= (037 | (~0177 & ~CHAR_CTL));
134 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
135 c &= (037 | (~0177 & ~CHAR_CTL));
137 if (c & CHAR_META)
139 /* Move the meta bit to the right place for a string. */
140 c = (c & ~CHAR_META) | 0x80;
143 return c;
147 /* Store multibyte form of character C at P. If C has modifier bits,
148 handle them appropriately. */
151 char_string (c, p)
152 unsigned c;
153 unsigned char *p;
155 int bytes;
157 if (c & CHAR_MODIFIER_MASK)
159 c = (unsigned) char_resolve_modifier_mask ((int) c);
160 /* If C still has any modifier bits, just ignore it. */
161 c &= ~CHAR_MODIFIER_MASK;
164 MAYBE_UNIFY_CHAR (c);
166 if (c <= MAX_3_BYTE_CHAR)
168 bytes = CHAR_STRING (c, p);
170 else if (c <= MAX_4_BYTE_CHAR)
172 p[0] = (0xF0 | (c >> 18));
173 p[1] = (0x80 | ((c >> 12) & 0x3F));
174 p[2] = (0x80 | ((c >> 6) & 0x3F));
175 p[3] = (0x80 | (c & 0x3F));
176 bytes = 4;
178 else if (c <= MAX_5_BYTE_CHAR)
180 p[0] = 0xF8;
181 p[1] = (0x80 | ((c >> 18) & 0x0F));
182 p[2] = (0x80 | ((c >> 12) & 0x3F));
183 p[3] = (0x80 | ((c >> 6) & 0x3F));
184 p[4] = (0x80 | (c & 0x3F));
185 bytes = 5;
187 else if (c <= MAX_CHAR)
189 c = CHAR_TO_BYTE8 (c);
190 bytes = BYTE8_STRING (c, p);
192 else
193 error ("Invalid character: %d", c);
195 return bytes;
/* Return the character whose multibyte form starts at P.  If LEN is
   not NULL, it must point to an int, and *LEN is set to the byte
   length of the multibyte form.  If ADVANCED is not NULL, it must
   point to a `const unsigned char *', and *ADVANCED is set to the
   address just past the multibyte form (i.e. the start of the next
   character).  */

int
string_char (p, advanced, len)
     const unsigned char *p;
     const unsigned char **advanced;
     int *len;
{
  const unsigned char *start = p;
  unsigned char head = *p;
  int c;

  if (head < 0x80 || (head & 0x30) != 0x30)
    /* Lead byte without both the 0x20 and 0x10 bits: let the macro
       decode it and advance P.  */
    c = STRING_CHAR_ADVANCE (p);
  else if ((head & 0x08) == 0)
    {
      /* Four-byte form.  */
      c = (((p[0] & 0xF) << 18)
           | ((p[1] & 0x3F) << 12)
           | ((p[2] & 0x3F) << 6)
           | (p[3] & 0x3F));
      p += 4;
    }
  else
    {
      /* Five-byte form; the lead byte carries no code bits.  */
      c = (((p[1] & 0x3F) << 18)
           | ((p[2] & 0x3F) << 12)
           | ((p[3] & 0x3F) << 6)
           | (p[4] & 0x3F));
      p += 5;
    }

  MAYBE_UNIFY_CHAR (c);

  if (len != NULL)
    *len = p - start;
  if (advanced != NULL)
    *advanced = p;
  return c;
}
246 /* Translate character C by translation table TABLE. If C is
247 negative, translate a character specified by CHARSET and CODE. If
248 no translation is found in TABLE, return the untranslated
249 character. If TABLE is a list, elements are char tables. In this
250 case, translace C by all tables. */
253 translate_char (table, c)
254 Lisp_Object table;
255 int c;
257 if (CHAR_TABLE_P (table))
259 Lisp_Object ch;
261 ch = CHAR_TABLE_REF (table, c);
262 if (CHARACTERP (ch))
263 c = XINT (ch);
265 else
267 for (; CONSP (table); table = XCDR (table))
268 c = translate_char (XCAR (table), c);
270 return c;
273 /* Convert the multibyte character C to unibyte 8-bit character based
274 on the current value of charset_unibyte. If dimension of
275 charset_unibyte is more than one, return (C & 0xFF).
277 The argument REV_TBL is now ignored. It will be removed in the
278 future. */
281 multibyte_char_to_unibyte (c, rev_tbl)
282 int c;
283 Lisp_Object rev_tbl;
285 struct charset *charset;
286 unsigned c1;
288 if (CHAR_BYTE8_P (c))
289 return CHAR_TO_BYTE8 (c);
290 charset = CHARSET_FROM_ID (charset_unibyte);
291 c1 = ENCODE_CHAR (charset, c);
292 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
295 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
296 by charset_unibyte. */
299 multibyte_char_to_unibyte_safe (c)
300 int c;
302 struct charset *charset;
303 unsigned c1;
305 if (CHAR_BYTE8_P (c))
306 return CHAR_TO_BYTE8 (c);
307 charset = CHARSET_FROM_ID (charset_unibyte);
308 c1 = ENCODE_CHAR (charset, c);
309 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
312 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
313 doc: /* Return non-nil if OBJECT is a character. */)
314 (object, ignore)
315 Lisp_Object object, ignore;
317 return (CHARACTERP (object) ? Qt : Qnil);
320 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
321 doc: /* Return the character of the maximum code. */)
324 return make_number (MAX_CHAR);
327 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
328 Sunibyte_char_to_multibyte, 1, 1, 0,
329 doc: /* Convert the byte CH to multibyte character. */)
330 (ch)
331 Lisp_Object ch;
333 int c;
334 struct charset *charset;
336 CHECK_CHARACTER (ch);
337 c = XFASTINT (ch);
338 if (c >= 0400)
339 error ("Invalid unibyte character: %d", c);
340 charset = CHARSET_FROM_ID (charset_unibyte);
341 c = DECODE_CHAR (charset, c);
342 if (c < 0)
343 c = BYTE8_TO_CHAR (XFASTINT (ch));
344 return make_number (c);
347 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
348 Smultibyte_char_to_unibyte, 1, 1, 0,
349 doc: /* Convert the multibyte character CH to a byte.
350 If the multibyte character does not represent a byte, return -1. */)
351 (ch)
352 Lisp_Object ch;
354 int cm;
356 CHECK_CHARACTER (ch);
357 cm = XFASTINT (ch);
358 if (cm < 256)
359 /* Can't distinguish a byte read from a unibyte buffer from
360 a latin1 char, so let's let it slide. */
361 return ch;
362 else
364 int cu = CHAR_TO_BYTE_SAFE (cm);
365 return make_number (cu);
369 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
370 doc: /* Return 1 regardless of the argument CHAR.
371 This is now an obsolete function. We keep it just for backward compatibility.
372 usage: (char-bytes CHAR) */)
373 (ch)
374 Lisp_Object ch;
376 CHECK_CHARACTER (ch);
377 return make_number (1);
380 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
381 doc: /* Return width of CHAR when displayed in the current buffer.
382 The width is measured by how many columns it occupies on the screen.
383 Tab is taken to occupy `tab-width' columns.
384 usage: (char-width CHAR) */)
385 (ch)
386 Lisp_Object ch;
388 Lisp_Object disp;
389 int c, width;
390 struct Lisp_Char_Table *dp = buffer_display_table ();
392 CHECK_CHARACTER (ch);
393 c = XINT (ch);
395 /* Get the way the display table would display it. */
396 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
398 if (VECTORP (disp))
399 width = ASIZE (disp);
400 else
401 width = CHAR_WIDTH (c);
403 return make_number (width);
406 /* Return width of string STR of length LEN when displayed in the
407 current buffer. The width is measured by how many columns it
408 occupies on the screen. If PRECISION > 0, return the width of
409 longest substring that doesn't exceed PRECISION, and set number of
410 characters and bytes of the substring in *NCHARS and *NBYTES
411 respectively. */
414 c_string_width (str, len, precision, nchars, nbytes)
415 const unsigned char *str;
416 int precision, *nchars, *nbytes;
418 int i = 0, i_byte = 0;
419 int width = 0;
420 struct Lisp_Char_Table *dp = buffer_display_table ();
422 while (i_byte < len)
424 int bytes, thiswidth;
425 Lisp_Object val;
426 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
428 if (dp)
430 val = DISP_CHAR_VECTOR (dp, c);
431 if (VECTORP (val))
432 thiswidth = XVECTOR (val)->size;
433 else
434 thiswidth = CHAR_WIDTH (c);
436 else
438 thiswidth = CHAR_WIDTH (c);
441 if (precision > 0
442 && (width + thiswidth > precision))
444 *nchars = i;
445 *nbytes = i_byte;
446 return width;
448 i++;
449 i_byte += bytes;
450 width += thiswidth;
453 if (precision > 0)
455 *nchars = i;
456 *nbytes = i_byte;
459 return width;
/* Return the width of the string STR of LEN bytes when displayed in
   the current buffer, measured in screen columns.  This is
   c_string_width with no precision limit.  */

int
strwidth (str, len)
     unsigned char *str;
     int len;
{
  int columns = c_string_width (str, len, -1, NULL, NULL);

  return columns;
}
474 /* Return width of Lisp string STRING when displayed in the current
475 buffer. The width is measured by how many columns it occupies on
476 the screen while paying attention to compositions. If PRECISION >
477 0, return the width of longest substring that doesn't exceed
478 PRECISION, and set number of characters and bytes of the substring
479 in *NCHARS and *NBYTES respectively. */
482 lisp_string_width (string, precision, nchars, nbytes)
483 Lisp_Object string;
484 int precision, *nchars, *nbytes;
486 int len = SCHARS (string);
487 /* This set multibyte to 0 even if STRING is multibyte when it
488 contains only ascii and eight-bit-graphic, but that's
489 intentional. */
490 int multibyte = len < SBYTES (string);
491 unsigned char *str = SDATA (string);
492 int i = 0, i_byte = 0;
493 int width = 0;
494 struct Lisp_Char_Table *dp = buffer_display_table ();
496 while (i < len)
498 int chars, bytes, thiswidth;
499 Lisp_Object val;
500 int cmp_id;
501 EMACS_INT ignore, end;
503 if (find_composition (i, -1, &ignore, &end, &val, string)
504 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
505 >= 0))
507 thiswidth = composition_table[cmp_id]->width;
508 chars = end - i;
509 bytes = string_char_to_byte (string, end) - i_byte;
511 else
513 int c;
515 if (multibyte)
516 c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
517 else
518 c = str[i_byte], bytes = 1;
519 chars = 1;
520 if (dp)
522 val = DISP_CHAR_VECTOR (dp, c);
523 if (VECTORP (val))
524 thiswidth = XVECTOR (val)->size;
525 else
526 thiswidth = CHAR_WIDTH (c);
528 else
530 thiswidth = CHAR_WIDTH (c);
534 if (precision > 0
535 && (width + thiswidth > precision))
537 *nchars = i;
538 *nbytes = i_byte;
539 return width;
541 i += chars;
542 i_byte += bytes;
543 width += thiswidth;
546 if (precision > 0)
548 *nchars = i;
549 *nbytes = i_byte;
552 return width;
555 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
556 doc: /* Return width of STRING when displayed in the current buffer.
557 Width is measured by how many columns it occupies on the screen.
558 When calculating width of a multibyte character in STRING,
559 only the base leading-code is considered; the validity of
560 the following bytes is not checked. Tabs in STRING are always
561 taken to occupy `tab-width' columns.
562 usage: (string-width STRING) */)
563 (str)
564 Lisp_Object str;
566 Lisp_Object val;
568 CHECK_STRING (str);
569 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
570 return val;
573 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
574 doc: /* Return the direction of CHAR.
575 The returned value is 0 for left-to-right and 1 for right-to-left.
576 usage: (char-direction CHAR) */)
577 (ch)
578 Lisp_Object ch;
580 int c;
582 CHECK_CHARACTER (ch);
583 c = XINT (ch);
584 return CHAR_TABLE_REF (Vchar_direction_table, c);
587 /* Return the number of characters in the NBYTES bytes at PTR.
588 This works by looking at the contents and checking for multibyte
589 sequences while assuming that there's no invalid sequence.
590 However, if the current buffer has enable-multibyte-characters =
591 nil, we treat each byte as a character. */
593 EMACS_INT
594 chars_in_text (ptr, nbytes)
595 const unsigned char *ptr;
596 EMACS_INT nbytes;
598 /* current_buffer is null at early stages of Emacs initialization. */
599 if (current_buffer == 0
600 || NILP (current_buffer->enable_multibyte_characters))
601 return nbytes;
603 return multibyte_chars_in_text (ptr, nbytes);
606 /* Return the number of characters in the NBYTES bytes at PTR.
607 This works by looking at the contents and checking for multibyte
608 sequences while assuming that there's no invalid sequence. It
609 ignores enable-multibyte-characters. */
611 EMACS_INT
612 multibyte_chars_in_text (ptr, nbytes)
613 const unsigned char *ptr;
614 EMACS_INT nbytes;
616 const unsigned char *endp = ptr + nbytes;
617 int chars = 0;
619 while (ptr < endp)
621 int len = MULTIBYTE_LENGTH (ptr, endp);
623 if (len == 0)
624 abort ();
625 ptr += len;
626 chars++;
629 return chars;
632 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
633 characters and bytes in it, and store them in *NCHARS and *NBYTES
634 respectively. On counting bytes, pay attention to that 8-bit
635 characters not constructing a valid multibyte sequence are
636 represented by 2-byte in a multibyte text. */
638 void
639 parse_str_as_multibyte (str, len, nchars, nbytes)
640 const unsigned char *str;
641 int len, *nchars, *nbytes;
643 const unsigned char *endp = str + len;
644 int n, chars = 0, bytes = 0;
646 if (len >= MAX_MULTIBYTE_LENGTH)
648 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
649 while (str < adjusted_endp)
651 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
652 str += n, bytes += n;
653 else
654 str++, bytes += 2;
655 chars++;
658 while (str < endp)
660 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
661 str += n, bytes += n;
662 else
663 str++, bytes += 2;
664 chars++;
667 *nchars = chars;
668 *nbytes = bytes;
669 return;
672 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
673 It actually converts only such 8-bit characters that don't contruct
674 a multibyte sequence to multibyte forms of Latin-1 characters. If
675 NCHARS is nonzero, set *NCHARS to the number of characters in the
676 text. It is assured that we can use LEN bytes at STR as a work
677 area and that is enough. Return the number of bytes of the
678 resulting text. */
681 str_as_multibyte (str, len, nbytes, nchars)
682 unsigned char *str;
683 int len, nbytes, *nchars;
685 unsigned char *p = str, *endp = str + nbytes;
686 unsigned char *to;
687 int chars = 0;
688 int n;
690 if (nbytes >= MAX_MULTIBYTE_LENGTH)
692 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
693 while (p < adjusted_endp
694 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
695 p += n, chars++;
697 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
698 p += n, chars++;
699 if (nchars)
700 *nchars = chars;
701 if (p == endp)
702 return nbytes;
704 to = p;
705 nbytes = endp - p;
706 endp = str + len;
707 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
708 p = endp - nbytes;
710 if (nbytes >= MAX_MULTIBYTE_LENGTH)
712 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
713 while (p < adjusted_endp)
715 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
717 while (n--)
718 *to++ = *p++;
720 else
722 int c = *p++;
723 c = BYTE8_TO_CHAR (c);
724 to += CHAR_STRING (c, to);
727 chars++;
729 while (p < endp)
731 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
733 while (n--)
734 *to++ = *p++;
736 else
738 int c = *p++;
739 c = BYTE8_TO_CHAR (c);
740 to += CHAR_STRING (c, to);
742 chars++;
744 if (nchars)
745 *nchars = chars;
746 return (to - str);
/* Parse the unibyte string at STR of LEN bytes, and return the number
   of bytes it may occupy when converted to a multibyte string by
   `str_to_multibyte'.  A byte below 0x80 counts as 1 byte, any other
   byte as 2.  */

int
parse_str_to_multibyte (str, len)
     unsigned char *str;
     int len;
{
  int total = 0;
  int i;

  for (i = 0; i < len; i++)
    {
      if (str[i] >= 0x80)
        total += 2;
      else
        total += 1;
    }
  return total;
}
/* Convert the unibyte text at STR of BYTES bytes to multibyte text
   holding the same single-byte characters: every byte of 0x80 or
   above is replaced by the multibyte form of BYTE8_TO_CHAR of it.
   The caller guarantees that LEN bytes at STR may be used as a work
   area and that this is enough.  Return the number of bytes of the
   resulting text.  */

int
str_to_multibyte (str, len, bytes)
     unsigned char *str;
     int len, bytes;
{
  unsigned char *from = str, *limit = str + bytes;
  unsigned char *to;
  int tail_len;

  /* A leading run of bytes below 0x80 needs no conversion.  */
  while (from < limit && *from < 0x80)
    from++;
  if (from == limit)
    return bytes;

  /* Move the remainder to the end of the work area, then convert it
     back towards the front.  */
  to = from;
  tail_len = limit - from;
  limit = str + len;
  safe_bcopy ((char *) from, (char *) (limit - tail_len), tail_len);
  from = limit - tail_len;

  while (from < limit)
    {
      int c = *from++;

      if (c >= 0x80)
        c = BYTE8_TO_CHAR (c);
      to += CHAR_STRING (c, to);
    }
  return (to - str);
}
/* Arrange the multibyte text at STR of BYTES bytes as unibyte text,
   in place.  Each character whose lead byte satisfies
   CHAR_BYTE8_HEAD_P is replaced by the single byte CHAR_TO_BYTE8 of
   it; all other characters are copied as they are.  Return the
   number of bytes of the resulting text.  */

int
str_as_unibyte (str, bytes)
     unsigned char *str;
     int bytes;
{
  const unsigned char *from = str, *limit = str + bytes;
  unsigned char *to;
  int c, seq_len;

  /* Nothing needs to move until the first eight-bit character.  */
  while (from < limit && ! CHAR_BYTE8_HEAD_P (*from))
    from += BYTES_BY_CHAR_HEAD (*from);

  to = str + (from - str);
  while (from < limit)
    {
      c = *from;
      if (CHAR_BYTE8_HEAD_P (c))
        {
          c = STRING_CHAR_ADVANCE (from);
          *to++ = CHAR_TO_BYTE8 (c);
        }
      else
        {
          for (seq_len = BYTES_BY_CHAR_HEAD (c); seq_len > 0; seq_len--)
            *to++ = *from++;
        }
    }
  return (to - str);
}
839 /* Convert eight-bit chars in SRC (in multibyte form) to the
840 corresponding byte and store in DST. CHARS is the number of
841 characters in SRC. The value is the number of bytes stored in DST.
842 Usually, the value is the same as CHARS, but is less than it if SRC
843 contains a non-ASCII, non-eight-bit characater. If ACCEPT_LATIN_1
844 is nonzero, a Latin-1 character is accepted and converted to a byte
845 of that character code.
846 Note: Currently the arg ACCEPT_LATIN_1 is not used. */
848 EMACS_INT
849 str_to_unibyte (src, dst, chars, accept_latin_1)
850 const unsigned char *src;
851 unsigned char *dst;
852 EMACS_INT chars;
853 int accept_latin_1;
855 EMACS_INT i;
857 for (i = 0; i < chars; i++)
859 int c = STRING_CHAR_ADVANCE (src);
861 if (CHAR_BYTE8_P (c))
862 c = CHAR_TO_BYTE8 (c);
863 else if (! ASCII_CHAR_P (c)
864 && (! accept_latin_1 || c >= 0x100))
865 return i;
866 *dst++ = c;
868 return i;
873 string_count_byte8 (string)
874 Lisp_Object string;
876 int multibyte = STRING_MULTIBYTE (string);
877 int nbytes = SBYTES (string);
878 unsigned char *p = SDATA (string);
879 unsigned char *pend = p + nbytes;
880 int count = 0;
881 int c, len;
883 if (multibyte)
884 while (p < pend)
886 c = *p;
887 len = BYTES_BY_CHAR_HEAD (c);
889 if (CHAR_BYTE8_HEAD_P (c))
890 count++;
891 p += len;
893 else
894 while (p < pend)
896 if (*p++ >= 0x80)
897 count++;
899 return count;
903 Lisp_Object
904 string_escape_byte8 (string)
905 Lisp_Object string;
907 int nchars = SCHARS (string);
908 int nbytes = SBYTES (string);
909 int multibyte = STRING_MULTIBYTE (string);
910 int byte8_count;
911 const unsigned char *src, *src_end;
912 unsigned char *dst;
913 Lisp_Object val;
914 int c, len;
916 if (multibyte && nchars == nbytes)
917 return string;
919 byte8_count = string_count_byte8 (string);
921 if (byte8_count == 0)
922 return string;
924 if (multibyte)
925 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
926 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
927 nbytes + byte8_count * 2);
928 else
929 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
930 val = make_uninit_string (nbytes + byte8_count * 3);
932 src = SDATA (string);
933 src_end = src + nbytes;
934 dst = SDATA (val);
935 if (multibyte)
936 while (src < src_end)
938 c = *src;
939 len = BYTES_BY_CHAR_HEAD (c);
941 if (CHAR_BYTE8_HEAD_P (c))
943 c = STRING_CHAR_ADVANCE (src);
944 c = CHAR_TO_BYTE8 (c);
945 sprintf ((char *) dst, "\\%03o", c);
946 dst += 4;
948 else
949 while (len--) *dst++ = *src++;
951 else
952 while (src < src_end)
954 c = *src++;
955 if (c >= 0x80)
957 sprintf ((char *) dst, "\\%03o", c);
958 dst += 4;
960 else
961 *dst++ = c;
963 return val;
967 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
968 doc: /*
969 Concatenate all the argument characters and make the result a string.
970 usage: (string &rest CHARACTERS) */)
971 (n, args)
972 int n;
973 Lisp_Object *args;
975 int i;
976 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
977 unsigned char *p = buf;
978 int c;
980 for (i = 0; i < n; i++)
982 CHECK_CHARACTER (args[i]);
983 c = XINT (args[i]);
984 p += CHAR_STRING (c, p);
987 return make_string_from_bytes ((char *) buf, n, p - buf);
990 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
991 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
992 usage: (unibyte-string &rest BYTES) */)
993 (n, args)
994 int n;
995 Lisp_Object *args;
997 int i;
998 unsigned char *buf = (unsigned char *) alloca (n);
999 unsigned char *p = buf;
1000 unsigned c;
1002 for (i = 0; i < n; i++)
1004 CHECK_NATNUM (args[i]);
1005 c = XFASTINT (args[i]);
1006 if (c >= 256)
1007 args_out_of_range_3 (args[i], make_number (0), make_number (255));
1008 *p++ = c;
1011 return make_string_from_bytes ((char *) buf, n, p - buf);
1014 DEFUN ("char-resolve-modifers", Fchar_resolve_modifiers,
1015 Schar_resolve_modifiers, 1, 1, 0,
1016 doc: /* Resolve modifiers in the character CHAR.
1017 The value is a character with modifiers resolved into the character
1018 code. Unresolved modifiers are kept in the value.
1019 usage: (char-resolve-modifers CHAR) */)
1020 (character)
1021 Lisp_Object character;
1023 int c;
1025 CHECK_NUMBER (character);
1026 c = XINT (character);
1027 return make_number (char_resolve_modifier_mask (c));
/* One-time initialization entry point for this file.  There is
   nothing to initialize here, so the body is empty.  */
void
init_character_once ()
{
}
1035 #ifdef emacs
1037 void
1038 syms_of_character ()
1040 DEFSYM (Qcharacterp, "characterp");
1041 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1043 staticpro (&Vchar_unify_table);
1044 Vchar_unify_table = Qnil;
1046 defsubr (&Smax_char);
1047 defsubr (&Scharacterp);
1048 defsubr (&Sunibyte_char_to_multibyte);
1049 defsubr (&Smultibyte_char_to_unibyte);
1050 defsubr (&Schar_bytes);
1051 defsubr (&Schar_width);
1052 defsubr (&Sstring_width);
1053 defsubr (&Schar_direction);
1054 defsubr (&Sstring);
1055 defsubr (&Sunibyte_string);
1056 defsubr (&Schar_resolve_modifiers);
1058 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1059 doc: /*
1060 Vector recording all translation tables ever defined.
1061 Each element is a pair (SYMBOL . TABLE) relating the table to the
1062 symbol naming it. The ID of a translation table is an index into this vector. */);
1063 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1065 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1066 doc: /*
1067 A char-table for characters which invoke auto-filling.
1068 Such characters have value t in this table. */);
1069 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1070 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1071 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1073 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1074 doc: /*
1075 A char-table for width (columns) of each character. */);
1076 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1077 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1078 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1079 make_number (4));
1081 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1082 doc: /* A char-table for direction of each character. */);
1083 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1085 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1086 doc: /* A char-table for each printable character. */);
1087 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1088 Fset_char_table_range (Vprintable_chars,
1089 Fcons (make_number (32), make_number (126)), Qt);
1090 Fset_char_table_range (Vprintable_chars,
1091 Fcons (make_number (160),
1092 make_number (MAX_5_BYTE_CHAR)), Qt);
1094 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1095 doc: /* Char table of script symbols.
1096 It has one extra slot whose value is a list of script symbols. */);
1098 /* Intern this now in case it isn't already done.
1099 Setting this variable twice is harmless.
1100 But don't staticpro it here--that is done in alloc.c. */
1101 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1102 DEFSYM (Qchar_script_table, "char-script-table");
1103 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1104 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1106 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
1107 doc: /* Alist of scripts vs the representative characters.
1108 Each element is a cons (SCRIPT . CHARS), where SCRIPT is a script name symbol,
1109 CHARS is a list or a vector of characters.
1110 If it is a list, all characters in the list is necessary for supporting SCRIPT.
1111 If it is a vector, one of the characters in the vector is necessary.
1112 This variable is used to find a font for a specific script. */);
1113 Vscript_representative_chars = Qnil;
1115 DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
1116 doc: /* Char table of Unicode's "General Category".
1117 All Unicode characters has one of the following values (symbol):
1118 Lw, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1119 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1120 See The Unicode Standard for the meaning of those values. */);
1121 /* The correct char-table is setup in characters.el. */
1122 Vunicode_category_table = Qnil;
1125 #endif /* emacs */
1127 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1128 (do not change this comment) */