Merge from emacs-24; up to 2013-01-03T02:31:36Z!rgm@gnu.org
[emacs.git] / src / character.c
blob6fefb6e8824f639ee8ec0c683949e626fb50234e
1 /* Basic character support.
3 Copyright (C) 2001-2013 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
5 Licensed to the Free Software Foundation.
6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
10 This file is part of GNU Emacs.
12 GNU Emacs is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
17 GNU Emacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
25 /* At first, see the document in `character.h' to understand the code
26 in this file. */
28 #ifdef emacs
29 #include <config.h>
30 #endif
32 #define CHARACTER_INLINE EXTERN_INLINE
34 #include <stdio.h>
36 #ifdef emacs
38 #include <sys/types.h>
39 #include <intprops.h>
40 #include "lisp.h"
41 #include "character.h"
42 #include "buffer.h"
43 #include "charset.h"
44 #include "composite.h"
45 #include "disptab.h"
47 #else /* not emacs */
49 #include "mulelib.h"
51 #endif /* emacs */
53 Lisp_Object Qcharacterp;
55 static Lisp_Object Qauto_fill_chars;
57 /* Char-table of information about which character to unify to which
58 Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
59 Lisp_Object Vchar_unify_table;
61 static Lisp_Object Qchar_script_table;
65 /* If character code C has modifier masks, reflect them to the
66 character code if possible. Return the resulting code. */
68 EMACS_INT
69 char_resolve_modifier_mask (EMACS_INT c)
71 /* A non-ASCII character can't reflect modifier bits to the code. */
72 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
73 return c;
75 /* For Meta, Shift, and Control modifiers, we need special care. */
76 if (c & CHAR_SHIFT)
78 /* Shift modifier is valid only with [A-Za-z]. */
79 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
80 c &= ~CHAR_SHIFT;
81 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
82 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
83 /* Shift modifier for control characters and SPC is ignored. */
84 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
85 c &= ~CHAR_SHIFT;
87 if (c & CHAR_CTL)
89 /* Simulate the code in lread.c. */
90 /* Allow `\C- ' and `\C-?'. */
91 if ((c & 0377) == ' ')
92 c &= ~0177 & ~ CHAR_CTL;
93 else if ((c & 0377) == '?')
94 c = 0177 | (c & ~0177 & ~CHAR_CTL);
95 /* ASCII control chars are made from letters (both cases),
96 as well as the non-letters within 0100...0137. */
97 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
98 c &= (037 | (~0177 & ~CHAR_CTL));
99 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
100 c &= (037 | (~0177 & ~CHAR_CTL));
102 #if 0 /* This is outside the scope of this function. (bug#4751) */
103 if (c & CHAR_META)
105 /* Move the meta bit to the right place for a string. */
106 c = (c & ~CHAR_META) | 0x80;
108 #endif
110 return c;
114 /* Store multibyte form of character C at P. If C has modifier bits,
115 handle them appropriately. */
118 char_string (unsigned int c, unsigned char *p)
120 int bytes;
122 if (c & CHAR_MODIFIER_MASK)
124 c = char_resolve_modifier_mask (c);
125 /* If C still has any modifier bits, just ignore it. */
126 c &= ~CHAR_MODIFIER_MASK;
129 if (c <= MAX_3_BYTE_CHAR)
131 bytes = CHAR_STRING (c, p);
133 else if (c <= MAX_4_BYTE_CHAR)
135 p[0] = (0xF0 | (c >> 18));
136 p[1] = (0x80 | ((c >> 12) & 0x3F));
137 p[2] = (0x80 | ((c >> 6) & 0x3F));
138 p[3] = (0x80 | (c & 0x3F));
139 bytes = 4;
141 else if (c <= MAX_5_BYTE_CHAR)
143 p[0] = 0xF8;
144 p[1] = (0x80 | ((c >> 18) & 0x0F));
145 p[2] = (0x80 | ((c >> 12) & 0x3F));
146 p[3] = (0x80 | ((c >> 6) & 0x3F));
147 p[4] = (0x80 | (c & 0x3F));
148 bytes = 5;
150 else if (c <= MAX_CHAR)
152 c = CHAR_TO_BYTE8 (c);
153 bytes = BYTE8_STRING (c, p);
155 else
156 error ("Invalid character: %x", c);
158 return bytes;
162 /* Return a character whose multibyte form is at P. If LEN is not
163 NULL, it must be a pointer to integer. In that case, set *LEN to
164 the byte length of the multibyte form. If ADVANCED is not NULL, it
165 must be a pointer to unsigned char. In that case, set *ADVANCED to
166 the ending address (i.e., the starting address of the next
167 character) of the multibyte form. */
170 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
172 int c;
173 const unsigned char *saved_p = p;
175 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
177 /* 1-, 2-, and 3-byte sequences can be handled by the macro. */
178 c = STRING_CHAR_ADVANCE (p);
180 else if (! (*p & 0x08))
182 /* A 4-byte sequence of this form:
183 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
184 c = ((((p)[0] & 0x7) << 18)
185 | (((p)[1] & 0x3F) << 12)
186 | (((p)[2] & 0x3F) << 6)
187 | ((p)[3] & 0x3F));
188 p += 4;
190 else
192 /* A 5-byte sequence of this form:
194 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
196 Note that the top 4 `x's are always 0, so shifting p[1] can
197 never exceed the maximum valid character codepoint. */
198 c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
199 (((p)[1] & 0x3F) << 18)
200 | (((p)[2] & 0x3F) << 12)
201 | (((p)[3] & 0x3F) << 6)
202 | ((p)[4] & 0x3F));
203 p += 5;
206 if (len)
207 *len = p - saved_p;
208 if (advanced)
209 *advanced = p;
210 return c;
214 /* Translate character C by translation table TABLE. If no translation is
215 found in TABLE, return the untranslated character. If TABLE is a list,
216 elements are char tables. In that case, recursively translate C by all the
217 tables in the list. */
220 translate_char (Lisp_Object table, int c)
222 if (CHAR_TABLE_P (table))
224 Lisp_Object ch;
226 ch = CHAR_TABLE_REF (table, c);
227 if (CHARACTERP (ch))
228 c = XINT (ch);
230 else
232 for (; CONSP (table); table = XCDR (table))
233 c = translate_char (XCAR (table), c);
235 return c;
238 /* Convert ASCII or 8-bit character C to unibyte. If C is none of
239 them, return (C & 0xFF). */
242 multibyte_char_to_unibyte (int c)
244 if (c < 0x80)
245 return c;
246 if (CHAR_BYTE8_P (c))
247 return CHAR_TO_BYTE8 (c);
248 return (c & 0xFF);
251 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
252 by charset_unibyte. */
255 multibyte_char_to_unibyte_safe (int c)
257 if (c < 0x80)
258 return c;
259 if (CHAR_BYTE8_P (c))
260 return CHAR_TO_BYTE8 (c);
261 return -1;
264 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
265 doc: /* Return non-nil if OBJECT is a character.
266 In Emacs Lisp, characters are represented by character codes, which
267 are non-negative integers. The function `max-char' returns the
268 maximum character code.
269 usage: (characterp OBJECT) */)
270 (Lisp_Object object, Lisp_Object ignore)
272 return (CHARACTERP (object) ? Qt : Qnil);
275 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
276 doc: /* Return the character of the maximum code. */)
277 (void)
279 return make_number (MAX_CHAR);
282 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
283 Sunibyte_char_to_multibyte, 1, 1, 0,
284 doc: /* Convert the byte CH to multibyte character. */)
285 (Lisp_Object ch)
287 int c;
289 CHECK_CHARACTER (ch);
290 c = XFASTINT (ch);
291 if (c >= 0x100)
292 error ("Not a unibyte character: %d", c);
293 MAKE_CHAR_MULTIBYTE (c);
294 return make_number (c);
297 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
298 Smultibyte_char_to_unibyte, 1, 1, 0,
299 doc: /* Convert the multibyte character CH to a byte.
300 If the multibyte character does not represent a byte, return -1. */)
301 (Lisp_Object ch)
303 int cm;
305 CHECK_CHARACTER (ch);
306 cm = XFASTINT (ch);
307 if (cm < 256)
308 /* Can't distinguish a byte read from a unibyte buffer from
309 a latin1 char, so let's let it slide. */
310 return ch;
311 else
313 int cu = CHAR_TO_BYTE_SAFE (cm);
314 return make_number (cu);
319 /* Return width (columns) of C considering the buffer display table DP. */
321 static ptrdiff_t
322 char_width (int c, struct Lisp_Char_Table *dp)
324 ptrdiff_t width = CHAR_WIDTH (c);
326 if (dp)
328 Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
329 int i;
331 if (VECTORP (disp))
332 for (i = 0, width = 0; i < ASIZE (disp); i++)
334 ch = AREF (disp, i);
335 if (CHARACTERP (ch))
337 int w = CHAR_WIDTH (XFASTINT (ch));
338 if (INT_ADD_OVERFLOW (width, w))
339 string_overflow ();
340 width += w;
344 return width;
348 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
349 doc: /* Return width of CHAR when displayed in the current buffer.
350 The width is measured by how many columns it occupies on the screen.
351 Tab is taken to occupy `tab-width' columns.
352 usage: (char-width CHAR) */)
353 (Lisp_Object ch)
355 int c;
356 ptrdiff_t width;
358 CHECK_CHARACTER (ch);
359 c = XINT (ch);
360 width = char_width (c, buffer_display_table ());
361 return make_number (width);
364 /* Return width of string STR of length LEN when displayed in the
365 current buffer. The width is measured by how many columns it
366 occupies on the screen. If PRECISION > 0, return the width of
367 longest substring that doesn't exceed PRECISION, and set number of
368 characters and bytes of the substring in *NCHARS and *NBYTES
369 respectively. */
371 ptrdiff_t
372 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
373 ptrdiff_t *nchars, ptrdiff_t *nbytes)
375 ptrdiff_t i = 0, i_byte = 0;
376 ptrdiff_t width = 0;
377 struct Lisp_Char_Table *dp = buffer_display_table ();
379 while (i_byte < len)
381 int bytes;
382 int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
383 ptrdiff_t thiswidth = char_width (c, dp);
385 if (precision <= 0)
387 if (INT_ADD_OVERFLOW (width, thiswidth))
388 string_overflow ();
390 else if (precision - width < thiswidth)
392 *nchars = i;
393 *nbytes = i_byte;
394 return width;
396 i++;
397 i_byte += bytes;
398 width += thiswidth;
401 if (precision > 0)
403 *nchars = i;
404 *nbytes = i_byte;
407 return width;
410 /* Return width of string STR of length LEN when displayed in the
411 current buffer. The width is measured by how many columns it
412 occupies on the screen. */
414 ptrdiff_t
415 strwidth (const char *str, ptrdiff_t len)
417 return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
420 /* Return width of Lisp string STRING when displayed in the current
421 buffer. The width is measured by how many columns it occupies on
422 the screen while paying attention to compositions. If PRECISION >
423 0, return the width of longest substring that doesn't exceed
424 PRECISION, and set number of characters and bytes of the substring
425 in *NCHARS and *NBYTES respectively. */
427 ptrdiff_t
428 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
429 ptrdiff_t *nchars, ptrdiff_t *nbytes)
431 ptrdiff_t len = SCHARS (string);
432 /* This set multibyte to 0 even if STRING is multibyte when it
433 contains only ascii and eight-bit-graphic, but that's
434 intentional. */
435 bool multibyte = len < SBYTES (string);
436 unsigned char *str = SDATA (string);
437 ptrdiff_t i = 0, i_byte = 0;
438 ptrdiff_t width = 0;
439 struct Lisp_Char_Table *dp = buffer_display_table ();
441 while (i < len)
443 ptrdiff_t chars, bytes, thiswidth;
444 Lisp_Object val;
445 ptrdiff_t cmp_id;
446 ptrdiff_t ignore, end;
448 if (find_composition (i, -1, &ignore, &end, &val, string)
449 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
450 >= 0))
452 thiswidth = composition_table[cmp_id]->width;
453 chars = end - i;
454 bytes = string_char_to_byte (string, end) - i_byte;
456 else
458 int c;
460 if (multibyte)
462 int cbytes;
463 c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
464 bytes = cbytes;
466 else
467 c = str[i_byte], bytes = 1;
468 chars = 1;
469 thiswidth = char_width (c, dp);
472 if (precision <= 0)
474 #ifdef emacs
475 if (INT_ADD_OVERFLOW (width, thiswidth))
476 string_overflow ();
477 #endif
479 else if (precision - width < thiswidth)
481 *nchars = i;
482 *nbytes = i_byte;
483 return width;
485 i += chars;
486 i_byte += bytes;
487 width += thiswidth;
490 if (precision > 0)
492 *nchars = i;
493 *nbytes = i_byte;
496 return width;
499 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
500 doc: /* Return width of STRING when displayed in the current buffer.
501 Width is measured by how many columns it occupies on the screen.
502 When calculating width of a multibyte character in STRING,
503 only the base leading-code is considered; the validity of
504 the following bytes is not checked. Tabs in STRING are always
505 taken to occupy `tab-width' columns.
506 usage: (string-width STRING) */)
507 (Lisp_Object str)
509 Lisp_Object val;
511 CHECK_STRING (str);
512 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
513 return val;
516 /* Return the number of characters in the NBYTES bytes at PTR.
517 This works by looking at the contents and checking for multibyte
518 sequences while assuming that there's no invalid sequence.
519 However, if the current buffer has enable-multibyte-characters =
520 nil, we treat each byte as a character. */
522 ptrdiff_t
523 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
525 /* current_buffer is null at early stages of Emacs initialization. */
526 if (current_buffer == 0
527 || NILP (BVAR (current_buffer, enable_multibyte_characters)))
528 return nbytes;
530 return multibyte_chars_in_text (ptr, nbytes);
533 /* Return the number of characters in the NBYTES bytes at PTR.
534 This works by looking at the contents and checking for multibyte
535 sequences while assuming that there's no invalid sequence. It
536 ignores enable-multibyte-characters. */
538 ptrdiff_t
539 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
541 const unsigned char *endp = ptr + nbytes;
542 ptrdiff_t chars = 0;
544 while (ptr < endp)
546 int len = MULTIBYTE_LENGTH (ptr, endp);
548 if (len == 0)
549 emacs_abort ();
550 ptr += len;
551 chars++;
554 return chars;
557 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
558 characters and bytes in it, and store them in *NCHARS and *NBYTES
559 respectively. On counting bytes, pay attention to that 8-bit
560 characters not constructing a valid multibyte sequence are
561 represented by 2-byte in a multibyte text. */
563 void
564 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
565 ptrdiff_t *nchars, ptrdiff_t *nbytes)
567 const unsigned char *endp = str + len;
568 int n;
569 ptrdiff_t chars = 0, bytes = 0;
571 if (len >= MAX_MULTIBYTE_LENGTH)
573 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
574 while (str < adjusted_endp)
576 if (! CHAR_BYTE8_HEAD_P (*str)
577 && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
578 str += n, bytes += n;
579 else
580 str++, bytes += 2;
581 chars++;
584 while (str < endp)
586 if (! CHAR_BYTE8_HEAD_P (*str)
587 && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
588 str += n, bytes += n;
589 else
590 str++, bytes += 2;
591 chars++;
594 *nchars = chars;
595 *nbytes = bytes;
596 return;
599 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
600 It actually converts only such 8-bit characters that don't construct
601 a multibyte sequence to multibyte forms of Latin-1 characters. If
602 NCHARS is nonzero, set *NCHARS to the number of characters in the
603 text. It is assured that we can use LEN bytes at STR as a work
604 area and that is enough. Return the number of bytes of the
605 resulting text. */
607 ptrdiff_t
608 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
609 ptrdiff_t *nchars)
611 unsigned char *p = str, *endp = str + nbytes;
612 unsigned char *to;
613 ptrdiff_t chars = 0;
614 int n;
616 if (nbytes >= MAX_MULTIBYTE_LENGTH)
618 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
619 while (p < adjusted_endp
620 && ! CHAR_BYTE8_HEAD_P (*p)
621 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
622 p += n, chars++;
624 while (p < endp
625 && ! CHAR_BYTE8_HEAD_P (*p)
626 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
627 p += n, chars++;
628 if (nchars)
629 *nchars = chars;
630 if (p == endp)
631 return nbytes;
633 to = p;
634 nbytes = endp - p;
635 endp = str + len;
636 memmove (endp - nbytes, p, nbytes);
637 p = endp - nbytes;
639 if (nbytes >= MAX_MULTIBYTE_LENGTH)
641 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
642 while (p < adjusted_endp)
644 if (! CHAR_BYTE8_HEAD_P (*p)
645 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
647 while (n--)
648 *to++ = *p++;
650 else
652 int c = *p++;
653 c = BYTE8_TO_CHAR (c);
654 to += CHAR_STRING (c, to);
657 chars++;
659 while (p < endp)
661 if (! CHAR_BYTE8_HEAD_P (*p)
662 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
664 while (n--)
665 *to++ = *p++;
667 else
669 int c = *p++;
670 c = BYTE8_TO_CHAR (c);
671 to += CHAR_STRING (c, to);
673 chars++;
675 if (nchars)
676 *nchars = chars;
677 return (to - str);
680 /* Parse unibyte string at STR of LEN bytes, and return the number of
681 bytes it may occupy when converted to multibyte string by
682 `str_to_multibyte'. */
684 ptrdiff_t
685 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
687 const unsigned char *endp = str + len;
688 ptrdiff_t bytes;
690 for (bytes = 0; str < endp; str++)
692 int n = *str < 0x80 ? 1 : 2;
693 if (INT_ADD_OVERFLOW (bytes, n))
694 string_overflow ();
695 bytes += n;
697 return bytes;
701 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
702 that contains the same single-byte characters. It actually
703 converts all 8-bit characters to multibyte forms. It is assured
704 that we can use LEN bytes at STR as a work area and that is
705 enough. */
707 ptrdiff_t
708 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
710 unsigned char *p = str, *endp = str + bytes;
711 unsigned char *to;
713 while (p < endp && *p < 0x80) p++;
714 if (p == endp)
715 return bytes;
716 to = p;
717 bytes = endp - p;
718 endp = str + len;
719 memmove (endp - bytes, p, bytes);
720 p = endp - bytes;
721 while (p < endp)
723 int c = *p++;
725 if (c >= 0x80)
726 c = BYTE8_TO_CHAR (c);
727 to += CHAR_STRING (c, to);
729 return (to - str);
732 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
733 actually converts characters in the range 0x80..0xFF to
734 unibyte. */
736 ptrdiff_t
737 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
739 const unsigned char *p = str, *endp = str + bytes;
740 unsigned char *to;
741 int c, len;
743 while (p < endp)
745 c = *p;
746 len = BYTES_BY_CHAR_HEAD (c);
747 if (CHAR_BYTE8_HEAD_P (c))
748 break;
749 p += len;
751 to = str + (p - str);
752 while (p < endp)
754 c = *p;
755 len = BYTES_BY_CHAR_HEAD (c);
756 if (CHAR_BYTE8_HEAD_P (c))
758 c = STRING_CHAR_ADVANCE (p);
759 *to++ = CHAR_TO_BYTE8 (c);
761 else
763 while (len--) *to++ = *p++;
766 return (to - str);
769 /* Convert eight-bit chars in SRC (in multibyte form) to the
770 corresponding byte and store in DST. CHARS is the number of
771 characters in SRC. The value is the number of bytes stored in DST.
772 Usually, the value is the same as CHARS, but is less than it if SRC
773 contains a non-ASCII, non-eight-bit character. */
775 ptrdiff_t
776 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
778 ptrdiff_t i;
780 for (i = 0; i < chars; i++)
782 int c = STRING_CHAR_ADVANCE (src);
784 if (CHAR_BYTE8_P (c))
785 c = CHAR_TO_BYTE8 (c);
786 else if (! ASCII_CHAR_P (c))
787 return i;
788 *dst++ = c;
790 return i;
794 static ptrdiff_t
795 string_count_byte8 (Lisp_Object string)
797 bool multibyte = STRING_MULTIBYTE (string);
798 ptrdiff_t nbytes = SBYTES (string);
799 unsigned char *p = SDATA (string);
800 unsigned char *pend = p + nbytes;
801 ptrdiff_t count = 0;
802 int c, len;
804 if (multibyte)
805 while (p < pend)
807 c = *p;
808 len = BYTES_BY_CHAR_HEAD (c);
810 if (CHAR_BYTE8_HEAD_P (c))
811 count++;
812 p += len;
814 else
815 while (p < pend)
817 if (*p++ >= 0x80)
818 count++;
820 return count;
824 Lisp_Object
825 string_escape_byte8 (Lisp_Object string)
827 ptrdiff_t nchars = SCHARS (string);
828 ptrdiff_t nbytes = SBYTES (string);
829 bool multibyte = STRING_MULTIBYTE (string);
830 ptrdiff_t byte8_count;
831 const unsigned char *src, *src_end;
832 unsigned char *dst;
833 Lisp_Object val;
834 int c, len;
836 if (multibyte && nchars == nbytes)
837 return string;
839 byte8_count = string_count_byte8 (string);
841 if (byte8_count == 0)
842 return string;
844 if (multibyte)
846 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
847 || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
848 string_overflow ();
850 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
851 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
852 nbytes + byte8_count * 2);
854 else
856 if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
857 string_overflow ();
859 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
860 val = make_uninit_string (nbytes + byte8_count * 3);
863 src = SDATA (string);
864 src_end = src + nbytes;
865 dst = SDATA (val);
866 if (multibyte)
867 while (src < src_end)
869 c = *src;
870 len = BYTES_BY_CHAR_HEAD (c);
872 if (CHAR_BYTE8_HEAD_P (c))
874 c = STRING_CHAR_ADVANCE (src);
875 c = CHAR_TO_BYTE8 (c);
876 dst += sprintf ((char *) dst, "\\%03o", c);
878 else
879 while (len--) *dst++ = *src++;
881 else
882 while (src < src_end)
884 c = *src++;
885 if (c >= 0x80)
886 dst += sprintf ((char *) dst, "\\%03o", c);
887 else
888 *dst++ = c;
890 return val;
894 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
895 doc: /*
896 Concatenate all the argument characters and make the result a string.
897 usage: (string &rest CHARACTERS) */)
898 (ptrdiff_t n, Lisp_Object *args)
900 ptrdiff_t i;
901 int c;
902 unsigned char *buf, *p;
903 Lisp_Object str;
904 USE_SAFE_ALLOCA;
906 SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
907 p = buf;
909 for (i = 0; i < n; i++)
911 CHECK_CHARACTER (args[i]);
912 c = XINT (args[i]);
913 p += CHAR_STRING (c, p);
916 str = make_string_from_bytes ((char *) buf, n, p - buf);
917 SAFE_FREE ();
918 return str;
921 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
922 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
923 usage: (unibyte-string &rest BYTES) */)
924 (ptrdiff_t n, Lisp_Object *args)
926 ptrdiff_t i;
927 Lisp_Object str;
928 USE_SAFE_ALLOCA;
929 unsigned char *buf = SAFE_ALLOCA (n);
930 unsigned char *p = buf;
932 for (i = 0; i < n; i++)
934 CHECK_RANGED_INTEGER (args[i], 0, 255);
935 *p++ = XINT (args[i]);
938 str = make_string_from_bytes ((char *) buf, n, p - buf);
939 SAFE_FREE ();
940 return str;
943 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
944 Schar_resolve_modifiers, 1, 1, 0,
945 doc: /* Resolve modifiers in the character CHAR.
946 The value is a character with modifiers resolved into the character
947 code. Unresolved modifiers are kept in the value.
948 usage: (char-resolve-modifiers CHAR) */)
949 (Lisp_Object character)
951 EMACS_INT c;
953 CHECK_NUMBER (character);
954 c = XINT (character);
955 return make_number (char_resolve_modifier_mask (c));
958 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
959 doc: /* Return a byte value of a character at point.
960 Optional 1st arg POSITION, if non-nil, is a position of a character to get
961 a byte value.
962 Optional 2nd arg STRING, if non-nil, is a string of which first
963 character is a target to get a byte value. In this case, POSITION, if
964 non-nil, is an index of a target character in the string.
966 If the current buffer (or STRING) is multibyte, and the target
967 character is not ASCII nor 8-bit character, an error is signaled. */)
968 (Lisp_Object position, Lisp_Object string)
970 int c;
971 ptrdiff_t pos;
972 unsigned char *p;
974 if (NILP (string))
976 if (NILP (position))
978 p = PT_ADDR;
980 else
982 CHECK_NUMBER_COERCE_MARKER (position);
983 if (XINT (position) < BEGV || XINT (position) >= ZV)
984 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
985 pos = XFASTINT (position);
986 p = CHAR_POS_ADDR (pos);
988 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
989 return make_number (*p);
991 else
993 CHECK_STRING (string);
994 if (NILP (position))
996 p = SDATA (string);
998 else
1000 CHECK_NATNUM (position);
1001 if (XINT (position) >= SCHARS (string))
1002 args_out_of_range (string, position);
1003 pos = XFASTINT (position);
1004 p = SDATA (string) + string_char_to_byte (string, pos);
1006 if (! STRING_MULTIBYTE (string))
1007 return make_number (*p);
1009 c = STRING_CHAR (p);
1010 if (CHAR_BYTE8_P (c))
1011 c = CHAR_TO_BYTE8 (c);
1012 else if (! ASCII_CHAR_P (c))
1013 error ("Not an ASCII nor an 8-bit character: %d", c);
1014 return make_number (c);
1017 #ifdef emacs
1019 void
1020 syms_of_character (void)
1022 DEFSYM (Qcharacterp, "characterp");
1023 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1025 staticpro (&Vchar_unify_table);
1026 Vchar_unify_table = Qnil;
1028 defsubr (&Smax_char);
1029 defsubr (&Scharacterp);
1030 defsubr (&Sunibyte_char_to_multibyte);
1031 defsubr (&Smultibyte_char_to_unibyte);
1032 defsubr (&Schar_width);
1033 defsubr (&Sstring_width);
1034 defsubr (&Sstring);
1035 defsubr (&Sunibyte_string);
1036 defsubr (&Schar_resolve_modifiers);
1037 defsubr (&Sget_byte);
1039 DEFVAR_LISP ("translation-table-vector", Vtranslation_table_vector,
1040 doc: /*
1041 Vector recording all translation tables ever defined.
1042 Each element is a pair (SYMBOL . TABLE) relating the table to the
1043 symbol naming it. The ID of a translation table is an index into this vector. */);
1044 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1046 DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
1047 doc: /*
1048 A char-table for characters which invoke auto-filling.
1049 Such characters have value t in this table. */);
1050 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1051 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1052 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1054 DEFVAR_LISP ("char-width-table", Vchar_width_table,
1055 doc: /*
1056 A char-table for width (columns) of each character. */);
1057 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1058 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1059 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1060 make_number (4));
1062 DEFVAR_LISP ("printable-chars", Vprintable_chars,
1063 doc: /* A char-table for each printable character. */);
1064 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1065 Fset_char_table_range (Vprintable_chars,
1066 Fcons (make_number (32), make_number (126)), Qt);
1067 Fset_char_table_range (Vprintable_chars,
1068 Fcons (make_number (160),
1069 make_number (MAX_5_BYTE_CHAR)), Qt);
1071 DEFVAR_LISP ("char-script-table", Vchar_script_table,
1072 doc: /* Char table of script symbols.
1073 It has one extra slot whose value is a list of script symbols. */);
1075 DEFSYM (Qchar_script_table, "char-script-table");
1076 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1077 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1079 DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
1080 doc: /* Alist of scripts vs the representative characters.
1081 Each element is a cons (SCRIPT . CHARS).
1082 SCRIPT is a symbol representing a script or a subgroup of a script.
1083 CHARS is a list or a vector of characters.
1084 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1085 If it is a vector, one of the characters in the vector is necessary.
1086 This variable is used to find a font for a specific script. */);
1087 Vscript_representative_chars = Qnil;
1089 DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
1090 doc: /* Char table of Unicode's "General Category".
1091 All Unicode characters have one of the following values (symbol):
1092 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1093 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1094 See The Unicode Standard for the meaning of those values. */);
1095 /* The correct char-table is setup in characters.el. */
1096 Vunicode_category_table = Qnil;
1099 #endif /* emacs */