src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2013 Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #define CHARACTER_INLINE EXTERN_INLINE
  33
  34 #include <stdio.h>
  35
  36 #ifdef emacs
  37
  38 #include <sys/types.h>
  39 #include <intprops.h>
  40 #include "lisp.h"
  41 #include "character.h"
  42 #include "buffer.h"
  43 #include "charset.h"
  44 #include "composite.h"
  45 #include "disptab.h"
  46
  47 #else  /* not emacs */
  48
  49 #include "mulelib.h"
  50
  51 #endif /* emacs */
  52
  53 Lisp_Object Qcharacterp;
  54
  55 static Lisp_Object Qauto_fill_chars;
  56
  57 /* Char-table of information about which character to unify to which
  58    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  59 Lisp_Object Vchar_unify_table;
  60
  61 static Lisp_Object Qchar_script_table;
  62
  63 \f
  64
  65 /* If character code C has modifier masks, reflect them to the
  66    character code if possible.  Return the resulting code.  */
  67
  68 EMACS_INT
  69 char_resolve_modifier_mask (EMACS_INT c)
  70 {
  71   /* A non-ASCII character can't reflect modifier bits to the code.  */
  72   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  73     return c;
  74
  75   /* For Meta, Shift, and Control modifiers, we need special care.  */
  76   if (c & CHAR_SHIFT)
  77     {
  78       /* Shift modifier is valid only with [A-Za-z].  */
  79       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  80         c &= ~CHAR_SHIFT;
  81       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  82         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  83       /* Shift modifier for control characters and SPC is ignored.  */
  84       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  85         c &= ~CHAR_SHIFT;
  86     }
  87   if (c & CHAR_CTL)
  88     {
  89       /* Simulate the code in lread.c.  */
  90       /* Allow `\C- ' and `\C-?'.  */
  91       if ((c & 0377) == ' ')
  92         c &= ~0177 & ~ CHAR_CTL;
  93       else if ((c & 0377) == '?')
  94         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  95       /* ASCII control chars are made from letters (both cases),
  96          as well as the non-letters within 0100...0137.  */
  97       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  98         c &= (037 | (~0177 & ~CHAR_CTL));
  99       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 100         c &= (037 | (~0177 & ~CHAR_CTL));
 101     }
 102 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 103   if (c & CHAR_META)
 104     {
 105       /* Move the meta bit to the right place for a string.  */
 106       c = (c & ~CHAR_META) | 0x80;
 107     }
 108 #endif
 109
 110   return c;
 111 }
 112
 113
 114 /* Store multibyte form of character C at P.  If C has modifier bits,
 115    handle them appropriately.  */
 116
 117 int
 118 char_string (unsigned int c, unsigned char *p)
 119 {
 120   int bytes;
 121
 122   if (c & CHAR_MODIFIER_MASK)
 123     {
 124       c = char_resolve_modifier_mask (c);
 125       /* If C still has any modifier bits, just ignore it.  */
 126       c &= ~CHAR_MODIFIER_MASK;
 127     }
 128
 129   if (c <= MAX_3_BYTE_CHAR)
 130     {
 131       bytes = CHAR_STRING (c, p);
 132     }
 133   else if (c <= MAX_4_BYTE_CHAR)
 134     {
 135       p[0] = (0xF0 | (c >> 18));
 136       p[1] = (0x80 | ((c >> 12) & 0x3F));
 137       p[2] = (0x80 | ((c >> 6) & 0x3F));
 138       p[3] = (0x80 | (c & 0x3F));
 139       bytes = 4;
 140     }
 141   else if (c <= MAX_5_BYTE_CHAR)
 142     {
 143       p[0] = 0xF8;
 144       p[1] = (0x80 | ((c >> 18) & 0x0F));
 145       p[2] = (0x80 | ((c >> 12) & 0x3F));
 146       p[3] = (0x80 | ((c >> 6) & 0x3F));
 147       p[4] = (0x80 | (c & 0x3F));
 148       bytes = 5;
 149     }
 150   else if (c <= MAX_CHAR)
 151     {
 152       c = CHAR_TO_BYTE8 (c);
 153       bytes = BYTE8_STRING (c, p);
 154     }
 155   else
 156     error ("Invalid character: %x", c);
 157
 158   return bytes;
 159 }
 160
 161
 162 /* Return a character whose multibyte form is at P.  If LEN is not
 163    NULL, it must be a pointer to integer.  In that case, set *LEN to
 164    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 165    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 166    the ending address (i.e., the starting address of the next
 167    character) of the multibyte form.  */
 168
 169 int
 170 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 171 {
 172   int c;
 173   const unsigned char *saved_p = p;
 174
 175   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 176     {
 177       /* 1-, 2-, and 3-byte sequences can be handled by the macro.  */
 178       c = STRING_CHAR_ADVANCE (p);
 179     }
 180   else if (! (*p & 0x08))
 181     {
 182       /* A 4-byte sequence of this form:
 183          11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  */
 184       c = ((((p)[0] & 0x7) << 18)
 185            | (((p)[1] & 0x3F) << 12)
 186            | (((p)[2] & 0x3F) << 6)
 187            | ((p)[3] & 0x3F));
 188       p += 4;
 189     }
 190   else
 191     {
 192       /* A 5-byte sequence of this form:
 193
 194          111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 195
 196          Note that the top 4 `x's are always 0, so shifting p[1] can
 197          never exceed the maximum valid character codepoint. */
 198       c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
 199            (((p)[1] & 0x3F) << 18)
 200            | (((p)[2] & 0x3F) << 12)
 201            | (((p)[3] & 0x3F) << 6)
 202            | ((p)[4] & 0x3F));
 203       p += 5;
 204     }
 205
 206   if (len)
 207     *len = p - saved_p;
 208   if (advanced)
 209     *advanced = p;
 210   return c;
 211 }
 212
 213
 214 /* Translate character C by translation table TABLE.  If no translation is
 215    found in TABLE, return the untranslated character.  If TABLE is a list,
 216    elements are char tables.  In that case, recursively translate C by all the
 217    tables in the list.  */
 218
 219 int
 220 translate_char (Lisp_Object table, int c)
 221 {
 222   if (CHAR_TABLE_P (table))
 223     {
 224       Lisp_Object ch;
 225
 226       ch = CHAR_TABLE_REF (table, c);
 227       if (CHARACTERP (ch))
 228         c = XINT (ch);
 229     }
 230   else
 231     {
 232       for (; CONSP (table); table = XCDR (table))
 233         c = translate_char (XCAR (table), c);
 234     }
 235   return c;
 236 }
 237
 238 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 239    them, return (C & 0xFF).  */
 240
 241 int
 242 multibyte_char_to_unibyte (int c)
 243 {
 244   if (c < 0x80)
 245     return c;
 246   if (CHAR_BYTE8_P (c))
 247     return CHAR_TO_BYTE8 (c);
 248   return (c & 0xFF);
 249 }
 250
 251 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 252    by charset_unibyte.  */
 253
 254 int
 255 multibyte_char_to_unibyte_safe (int c)
 256 {
 257   if (c < 0x80)
 258     return c;
 259   if (CHAR_BYTE8_P (c))
 260     return CHAR_TO_BYTE8 (c);
 261   return -1;
 262 }
 263
 264 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 265        doc: /* Return non-nil if OBJECT is a character.
 266 In Emacs Lisp, characters are represented by character codes, which
 267 are non-negative integers.  The function `max-char' returns the
 268 maximum character code.
 269 usage: (characterp OBJECT)  */)
 270   (Lisp_Object object, Lisp_Object ignore)
 271 {
 272   return (CHARACTERP (object) ? Qt : Qnil);
 273 }
 274
 275 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 276        doc: /* Return the character of the maximum code.  */)
 277   (void)
 278 {
 279   return make_number (MAX_CHAR);
 280 }
 281
 282 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 283        Sunibyte_char_to_multibyte, 1, 1, 0,
 284        doc: /* Convert the byte CH to multibyte character.  */)
 285   (Lisp_Object ch)
 286 {
 287   int c;
 288
 289   CHECK_CHARACTER (ch);
 290   c = XFASTINT (ch);
 291   if (c >= 0x100)
 292     error ("Not a unibyte character: %d", c);
 293   MAKE_CHAR_MULTIBYTE (c);
 294   return make_number (c);
 295 }
 296
 297 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 298        Smultibyte_char_to_unibyte, 1, 1, 0,
 299        doc: /* Convert the multibyte character CH to a byte.
 300 If the multibyte character does not represent a byte, return -1.  */)
 301   (Lisp_Object ch)
 302 {
 303   int cm;
 304
 305   CHECK_CHARACTER (ch);
 306   cm = XFASTINT (ch);
 307   if (cm < 256)
 308     /* Can't distinguish a byte read from a unibyte buffer from
 309        a latin1 char, so let's let it slide.  */
 310     return ch;
 311   else
 312     {
 313       int cu = CHAR_TO_BYTE_SAFE (cm);
 314       return make_number (cu);
 315     }
 316 }
 317
 318
 319 /* Return width (columns) of C considering the buffer display table DP. */
 320
 321 static ptrdiff_t
 322 char_width (int c, struct Lisp_Char_Table *dp)
 323 {
 324   ptrdiff_t width = CHAR_WIDTH (c);
 325
 326   if (dp)
 327     {
 328       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 329       int i;
 330
 331       if (VECTORP (disp))
 332         for (i = 0, width = 0; i < ASIZE (disp); i++)
 333           {
 334             ch = AREF (disp, i);
 335             if (CHARACTERP (ch))
 336               {
 337                 int w = CHAR_WIDTH (XFASTINT (ch));
 338                 if (INT_ADD_OVERFLOW (width, w))
 339                   string_overflow ();
 340                 width += w;
 341               }
 342           }
 343     }
 344   return width;
 345 }
 346
 347
 348 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 349        doc: /* Return width of CHAR when displayed in the current buffer.
 350 The width is measured by how many columns it occupies on the screen.
 351 Tab is taken to occupy `tab-width' columns.
 352 usage: (char-width CHAR)  */)
 353   (Lisp_Object ch)
 354 {
 355   int c;
 356   ptrdiff_t width;
 357
 358   CHECK_CHARACTER (ch);
 359   c = XINT (ch);
 360   width = char_width (c, buffer_display_table ());
 361   return make_number (width);
 362 }
 363
 364 /* Return width of string STR of length LEN when displayed in the
 365    current buffer.  The width is measured by how many columns it
 366    occupies on the screen.  If PRECISION > 0, return the width of
 367    longest substring that doesn't exceed PRECISION, and set number of
 368    characters and bytes of the substring in *NCHARS and *NBYTES
 369    respectively.  */
 370
 371 ptrdiff_t
 372 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 373                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 374 {
 375   ptrdiff_t i = 0, i_byte = 0;
 376   ptrdiff_t width = 0;
 377   struct Lisp_Char_Table *dp = buffer_display_table ();
 378
 379   while (i_byte < len)
 380     {
 381       int bytes;
 382       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 383       ptrdiff_t thiswidth = char_width (c, dp);
 384
 385       if (precision <= 0)
 386         {
 387           if (INT_ADD_OVERFLOW (width, thiswidth))
 388             string_overflow ();
 389         }
 390       else if (precision - width < thiswidth)
 391         {
 392           *nchars = i;
 393           *nbytes = i_byte;
 394           return width;
 395         }
 396       i++;
 397       i_byte += bytes;
 398       width += thiswidth;
 399   }
 400
 401   if (precision > 0)
 402     {
 403       *nchars = i;
 404       *nbytes = i_byte;
 405     }
 406
 407   return width;
 408 }
 409
 410 /* Return width of string STR of length LEN when displayed in the
 411    current buffer.  The width is measured by how many columns it
 412    occupies on the screen.  */
 413
 414 ptrdiff_t
 415 strwidth (const char *str, ptrdiff_t len)
 416 {
 417   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 418 }
 419
 420 /* Return width of Lisp string STRING when displayed in the current
 421    buffer.  The width is measured by how many columns it occupies on
 422    the screen while paying attention to compositions.  If PRECISION >
 423    0, return the width of longest substring that doesn't exceed
 424    PRECISION, and set number of characters and bytes of the substring
 425    in *NCHARS and *NBYTES respectively.  */
 426
 427 ptrdiff_t
 428 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 429                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 430 {
 431   ptrdiff_t len = SCHARS (string);
 432   /* This set multibyte to 0 even if STRING is multibyte when it
 433      contains only ascii and eight-bit-graphic, but that's
 434      intentional.  */
 435   bool multibyte = len < SBYTES (string);
 436   unsigned char *str = SDATA (string);
 437   ptrdiff_t i = 0, i_byte = 0;
 438   ptrdiff_t width = 0;
 439   struct Lisp_Char_Table *dp = buffer_display_table ();
 440
 441   while (i < len)
 442     {
 443       ptrdiff_t chars, bytes, thiswidth;
 444       Lisp_Object val;
 445       ptrdiff_t cmp_id;
 446       ptrdiff_t ignore, end;
 447
 448       if (find_composition (i, -1, &ignore, &end, &val, string)
 449           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 450               >= 0))
 451         {
 452           thiswidth = composition_table[cmp_id]->width;
 453           chars = end - i;
 454           bytes = string_char_to_byte (string, end) - i_byte;
 455         }
 456       else
 457         {
 458           int c;
 459
 460           if (multibyte)
 461             {
 462               int cbytes;
 463               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 464               bytes = cbytes;
 465             }
 466           else
 467             c = str[i_byte], bytes = 1;
 468           chars = 1;
 469           thiswidth = char_width (c, dp);
 470         }
 471
 472       if (precision <= 0)
 473         {
 474 #ifdef emacs
 475           if (INT_ADD_OVERFLOW (width, thiswidth))
 476             string_overflow ();
 477 #endif
 478         }
 479       else if (precision - width < thiswidth)
 480         {
 481           *nchars = i;
 482           *nbytes = i_byte;
 483           return width;
 484         }
 485       i += chars;
 486       i_byte += bytes;
 487       width += thiswidth;
 488     }
 489
 490   if (precision > 0)
 491     {
 492       *nchars = i;
 493       *nbytes = i_byte;
 494     }
 495
 496   return width;
 497 }
 498
 499 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 500        doc: /* Return width of STRING when displayed in the current buffer.
 501 Width is measured by how many columns it occupies on the screen.
 502 When calculating width of a multibyte character in STRING,
 503 only the base leading-code is considered; the validity of
 504 the following bytes is not checked.  Tabs in STRING are always
 505 taken to occupy `tab-width' columns.
 506 usage: (string-width STRING)  */)
 507   (Lisp_Object str)
 508 {
 509   Lisp_Object val;
 510
 511   CHECK_STRING (str);
 512   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 513   return val;
 514 }
 515
 516 /* Return the number of characters in the NBYTES bytes at PTR.
 517    This works by looking at the contents and checking for multibyte
 518    sequences while assuming that there's no invalid sequence.
 519    However, if the current buffer has enable-multibyte-characters =
 520    nil, we treat each byte as a character.  */
 521
 522 ptrdiff_t
 523 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 524 {
 525   /* current_buffer is null at early stages of Emacs initialization.  */
 526   if (current_buffer == 0
 527       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 528     return nbytes;
 529
 530   return multibyte_chars_in_text (ptr, nbytes);
 531 }
 532
 533 /* Return the number of characters in the NBYTES bytes at PTR.
 534    This works by looking at the contents and checking for multibyte
 535    sequences while assuming that there's no invalid sequence.  It
 536    ignores enable-multibyte-characters.  */
 537
 538 ptrdiff_t
 539 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 540 {
 541   const unsigned char *endp = ptr + nbytes;
 542   ptrdiff_t chars = 0;
 543
 544   while (ptr < endp)
 545     {
 546       int len = MULTIBYTE_LENGTH (ptr, endp);
 547
 548       if (len == 0)
 549         emacs_abort ();
 550       ptr += len;
 551       chars++;
 552     }
 553
 554   return chars;
 555 }
 556
 557 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 558    characters and bytes in it, and store them in *NCHARS and *NBYTES
 559    respectively.  On counting bytes, pay attention to that 8-bit
 560    characters not constructing a valid multibyte sequence are
 561    represented by 2-byte in a multibyte text.  */
 562
 563 void
 564 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 565                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 566 {
 567   const unsigned char *endp = str + len;
 568   int n;
 569   ptrdiff_t chars = 0, bytes = 0;
 570
 571   if (len >= MAX_MULTIBYTE_LENGTH)
 572     {
 573       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 574       while (str < adjusted_endp)
 575         {
 576           if (! CHAR_BYTE8_HEAD_P (*str)
 577               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 578             str += n, bytes += n;
 579           else
 580             str++, bytes += 2;
 581           chars++;
 582         }
 583     }
 584   while (str < endp)
 585     {
 586       if (! CHAR_BYTE8_HEAD_P (*str)
 587           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 588         str += n, bytes += n;
 589       else
 590         str++, bytes += 2;
 591       chars++;
 592     }
 593
 594   *nchars = chars;
 595   *nbytes = bytes;
 596   return;
 597 }
 598
 599 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 600    It actually converts only such 8-bit characters that don't construct
 601    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 602    NCHARS is nonzero, set *NCHARS to the number of characters in the
 603    text.  It is assured that we can use LEN bytes at STR as a work
 604    area and that is enough.  Return the number of bytes of the
 605    resulting text.  */
 606
 607 ptrdiff_t
 608 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 609                   ptrdiff_t *nchars)
 610 {
 611   unsigned char *p = str, *endp = str + nbytes;
 612   unsigned char *to;
 613   ptrdiff_t chars = 0;
 614   int n;
 615
 616   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 617     {
 618       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 619       while (p < adjusted_endp
 620              && ! CHAR_BYTE8_HEAD_P (*p)
 621              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 622         p += n, chars++;
 623     }
 624   while (p < endp
 625          && ! CHAR_BYTE8_HEAD_P (*p)
 626          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 627     p += n, chars++;
 628   if (nchars)
 629     *nchars = chars;
 630   if (p == endp)
 631     return nbytes;
 632
 633   to = p;
 634   nbytes = endp - p;
 635   endp = str + len;
 636   memmove (endp - nbytes, p, nbytes);
 637   p = endp - nbytes;
 638
 639   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 640     {
 641       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 642       while (p < adjusted_endp)
 643         {
 644           if (! CHAR_BYTE8_HEAD_P (*p)
 645               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 646             {
 647               while (n--)
 648                 *to++ = *p++;
 649             }
 650           else
 651             {
 652               int c = *p++;
 653               c = BYTE8_TO_CHAR (c);
 654               to += CHAR_STRING (c, to);
 655             }
 656         }
 657       chars++;
 658     }
 659   while (p < endp)
 660     {
 661       if (! CHAR_BYTE8_HEAD_P (*p)
 662           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 663         {
 664           while (n--)
 665             *to++ = *p++;
 666         }
 667       else
 668         {
 669           int c = *p++;
 670           c = BYTE8_TO_CHAR (c);
 671           to += CHAR_STRING (c, to);
 672         }
 673       chars++;
 674     }
 675   if (nchars)
 676     *nchars = chars;
 677   return (to - str);
 678 }
 679
 680 /* Parse unibyte string at STR of LEN bytes, and return the number of
 681    bytes it may occupy when converted to multibyte string by
 682    `str_to_multibyte'.  */
 683
 684 ptrdiff_t
 685 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 686 {
 687   const unsigned char *endp = str + len;
 688   ptrdiff_t bytes;
 689
 690   for (bytes = 0; str < endp; str++)
 691     {
 692       int n = *str < 0x80 ? 1 : 2;
 693       if (INT_ADD_OVERFLOW (bytes, n))
 694         string_overflow ();
 695       bytes += n;
 696     }
 697   return bytes;
 698 }
 699
 700
 701 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 702    that contains the same single-byte characters.  It actually
 703    converts all 8-bit characters to multibyte forms.  It is assured
 704    that we can use LEN bytes at STR as a work area and that is
 705    enough.  */
 706
 707 ptrdiff_t
 708 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 709 {
 710   unsigned char *p = str, *endp = str + bytes;
 711   unsigned char *to;
 712
 713   while (p < endp && *p < 0x80) p++;
 714   if (p == endp)
 715     return bytes;
 716   to = p;
 717   bytes = endp - p;
 718   endp = str + len;
 719   memmove (endp - bytes, p, bytes);
 720   p = endp - bytes;
 721   while (p < endp)
 722     {
 723       int c = *p++;
 724
 725       if (c >= 0x80)
 726         c = BYTE8_TO_CHAR (c);
 727       to += CHAR_STRING (c, to);
 728     }
 729   return (to - str);
 730 }
 731
 732 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 733    actually converts characters in the range 0x80..0xFF to
 734    unibyte.  */
 735
 736 ptrdiff_t
 737 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 738 {
 739   const unsigned char *p = str, *endp = str + bytes;
 740   unsigned char *to;
 741   int c, len;
 742
 743   while (p < endp)
 744     {
 745       c = *p;
 746       len = BYTES_BY_CHAR_HEAD (c);
 747       if (CHAR_BYTE8_HEAD_P (c))
 748         break;
 749       p += len;
 750     }
 751   to = str + (p - str);
 752   while (p < endp)
 753     {
 754       c = *p;
 755       len = BYTES_BY_CHAR_HEAD (c);
 756       if (CHAR_BYTE8_HEAD_P (c))
 757         {
 758           c = STRING_CHAR_ADVANCE (p);
 759           *to++ = CHAR_TO_BYTE8 (c);
 760         }
 761       else
 762         {
 763           while (len--) *to++ = *p++;
 764         }
 765     }
 766   return (to - str);
 767 }
 768
 769 /* Convert eight-bit chars in SRC (in multibyte form) to the
 770    corresponding byte and store in DST.  CHARS is the number of
 771    characters in SRC.  The value is the number of bytes stored in DST.
 772    Usually, the value is the same as CHARS, but is less than it if SRC
 773    contains a non-ASCII, non-eight-bit character.  */
 774
 775 ptrdiff_t
 776 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
 777 {
 778   ptrdiff_t i;
 779
 780   for (i = 0; i < chars; i++)
 781     {
 782       int c = STRING_CHAR_ADVANCE (src);
 783
 784       if (CHAR_BYTE8_P (c))
 785         c = CHAR_TO_BYTE8 (c);
 786       else if (! ASCII_CHAR_P (c))
 787         return i;
 788       *dst++ = c;
 789     }
 790   return i;
 791 }
 792
 793
 794 static ptrdiff_t
 795 string_count_byte8 (Lisp_Object string)
 796 {
 797   bool multibyte = STRING_MULTIBYTE (string);
 798   ptrdiff_t nbytes = SBYTES (string);
 799   unsigned char *p = SDATA (string);
 800   unsigned char *pend = p + nbytes;
 801   ptrdiff_t count = 0;
 802   int c, len;
 803
 804   if (multibyte)
 805     while (p < pend)
 806       {
 807         c = *p;
 808         len = BYTES_BY_CHAR_HEAD (c);
 809
 810         if (CHAR_BYTE8_HEAD_P (c))
 811           count++;
 812         p += len;
 813       }
 814   else
 815     while (p < pend)
 816       {
 817         if (*p++ >= 0x80)
 818           count++;
 819       }
 820   return count;
 821 }
 822
 823
 824 Lisp_Object
 825 string_escape_byte8 (Lisp_Object string)
 826 {
 827   ptrdiff_t nchars = SCHARS (string);
 828   ptrdiff_t nbytes = SBYTES (string);
 829   bool multibyte = STRING_MULTIBYTE (string);
 830   ptrdiff_t byte8_count;
 831   const unsigned char *src, *src_end;
 832   unsigned char *dst;
 833   Lisp_Object val;
 834   int c, len;
 835
 836   if (multibyte && nchars == nbytes)
 837     return string;
 838
 839   byte8_count = string_count_byte8 (string);
 840
 841   if (byte8_count == 0)
 842     return string;
 843
 844   if (multibyte)
 845     {
 846       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 847           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 848         string_overflow ();
 849
 850       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 851       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 852                                           nbytes + byte8_count * 2);
 853     }
 854   else
 855     {
 856       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 857         string_overflow ();
 858
 859       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 860       val = make_uninit_string (nbytes + byte8_count * 3);
 861     }
 862
 863   src = SDATA (string);
 864   src_end = src + nbytes;
 865   dst = SDATA (val);
 866   if (multibyte)
 867     while (src < src_end)
 868       {
 869         c = *src;
 870         len = BYTES_BY_CHAR_HEAD (c);
 871
 872         if (CHAR_BYTE8_HEAD_P (c))
 873           {
 874             c = STRING_CHAR_ADVANCE (src);
 875             c = CHAR_TO_BYTE8 (c);
 876             dst += sprintf ((char *) dst, "\\%03o", c);
 877           }
 878         else
 879           while (len--) *dst++ = *src++;
 880       }
 881   else
 882     while (src < src_end)
 883       {
 884         c = *src++;
 885         if (c >= 0x80)
 886           dst += sprintf ((char *) dst, "\\%03o", c);
 887         else
 888           *dst++ = c;
 889       }
 890   return val;
 891 }
 892
 893 \f
 894 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 895        doc: /*
 896 Concatenate all the argument characters and make the result a string.
 897 usage: (string &rest CHARACTERS)  */)
 898   (ptrdiff_t n, Lisp_Object *args)
 899 {
 900   ptrdiff_t i;
 901   int c;
 902   unsigned char *buf, *p;
 903   Lisp_Object str;
 904   USE_SAFE_ALLOCA;
 905
 906   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 907   p = buf;
 908
 909   for (i = 0; i < n; i++)
 910     {
 911       CHECK_CHARACTER (args[i]);
 912       c = XINT (args[i]);
 913       p += CHAR_STRING (c, p);
 914     }
 915
 916   str = make_string_from_bytes ((char *) buf, n, p - buf);
 917   SAFE_FREE ();
 918   return str;
 919 }
 920
 921 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 922        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 923 usage: (unibyte-string &rest BYTES)  */)
 924   (ptrdiff_t n, Lisp_Object *args)
 925 {
 926   ptrdiff_t i;
 927   Lisp_Object str;
 928   USE_SAFE_ALLOCA;
 929   unsigned char *buf = SAFE_ALLOCA (n);
 930   unsigned char *p = buf;
 931
 932   for (i = 0; i < n; i++)
 933     {
 934       CHECK_RANGED_INTEGER (args[i], 0, 255);
 935       *p++ = XINT (args[i]);
 936     }
 937
 938   str = make_string_from_bytes ((char *) buf, n, p - buf);
 939   SAFE_FREE ();
 940   return str;
 941 }
 942
 943 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 944        Schar_resolve_modifiers, 1, 1, 0,
 945        doc: /* Resolve modifiers in the character CHAR.
 946 The value is a character with modifiers resolved into the character
 947 code.  Unresolved modifiers are kept in the value.
 948 usage: (char-resolve-modifiers CHAR)  */)
 949   (Lisp_Object character)
 950 {
 951   EMACS_INT c;
 952
 953   CHECK_NUMBER (character);
 954   c = XINT (character);
 955   return make_number (char_resolve_modifier_mask (c));
 956 }
 957
 958 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 959        doc: /* Return a byte value of a character at point.
 960 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 961 a byte value.
 962 Optional 2nd arg STRING, if non-nil, is a string of which first
 963 character is a target to get a byte value.  In this case, POSITION, if
 964 non-nil, is an index of a target character in the string.
 965
 966 If the current buffer (or STRING) is multibyte, and the target
 967 character is not ASCII nor 8-bit character, an error is signaled.  */)
 968   (Lisp_Object position, Lisp_Object string)
 969 {
 970   int c;
 971   ptrdiff_t pos;
 972   unsigned char *p;
 973
 974   if (NILP (string))
 975     {
 976       if (NILP (position))
 977         {
 978           p = PT_ADDR;
 979         }
 980       else
 981         {
 982           CHECK_NUMBER_COERCE_MARKER (position);
 983           if (XINT (position) < BEGV || XINT (position) >= ZV)
 984             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 985           pos = XFASTINT (position);
 986           p = CHAR_POS_ADDR (pos);
 987         }
 988       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 989         return make_number (*p);
 990     }
 991   else
 992     {
 993       CHECK_STRING (string);
 994       if (NILP (position))
 995         {
 996           p = SDATA (string);
 997         }
 998       else
 999         {
1000           CHECK_NATNUM (position);
1001           if (XINT (position) >= SCHARS (string))
1002             args_out_of_range (string, position);
1003           pos = XFASTINT (position);
1004           p = SDATA (string) + string_char_to_byte (string, pos);
1005         }
1006       if (! STRING_MULTIBYTE (string))
1007         return make_number (*p);
1008     }
1009   c = STRING_CHAR (p);
1010   if (CHAR_BYTE8_P (c))
1011     c = CHAR_TO_BYTE8 (c);
1012   else if (! ASCII_CHAR_P (c))
1013     error ("Not an ASCII nor an 8-bit character: %d", c);
1014   return make_number (c);
1015 }
1016
1017 #ifdef emacs
1018
1019 void
1020 syms_of_character (void)
1021 {
1022   DEFSYM (Qcharacterp, "characterp");
1023   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1024
1025   staticpro (&Vchar_unify_table);
1026   Vchar_unify_table = Qnil;
1027
1028   defsubr (&Smax_char);
1029   defsubr (&Scharacterp);
1030   defsubr (&Sunibyte_char_to_multibyte);
1031   defsubr (&Smultibyte_char_to_unibyte);
1032   defsubr (&Schar_width);
1033   defsubr (&Sstring_width);
1034   defsubr (&Sstring);
1035   defsubr (&Sunibyte_string);
1036   defsubr (&Schar_resolve_modifiers);
1037   defsubr (&Sget_byte);
1038
1039   DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
1040                doc: /*
1041 Vector recording all translation tables ever defined.
1042 Each element is a pair (SYMBOL . TABLE) relating the table to the
1043 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1044   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1045
1046   DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
1047                doc: /*
1048 A char-table for characters which invoke auto-filling.
1049 Such characters have value t in this table.  */);
1050   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1051   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1052   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1053
1054   DEFVAR_LISP ("char-width-table", Vchar_width_table,
1055                doc: /*
1056 A char-table for width (columns) of each character.  */);
1057   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1058   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1059   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1060                         make_number (4));
1061
1062   DEFVAR_LISP ("printable-chars", Vprintable_chars,
1063                doc: /* A char-table for each printable character.  */);
1064   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1065   Fset_char_table_range (Vprintable_chars,
1066                          Fcons (make_number (32), make_number (126)), Qt);
1067   Fset_char_table_range (Vprintable_chars,
1068                          Fcons (make_number (160),
1069                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1070
1071   DEFVAR_LISP ("char-script-table", Vchar_script_table,
1072                doc: /* Char table of script symbols.
1073 It has one extra slot whose value is a list of script symbols.  */);
1074
1075   DEFSYM (Qchar_script_table, "char-script-table");
1076   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1077   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1078
1079   DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
1080                doc: /* Alist of scripts vs the representative characters.
1081 Each element is a cons (SCRIPT . CHARS).
1082 SCRIPT is a symbol representing a script or a subgroup of a script.
1083 CHARS is a list or a vector of characters.
1084 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1085 If it is a vector, one of the characters in the vector is necessary.
1086 This variable is used to find a font for a specific script.  */);
1087   Vscript_representative_chars = Qnil;
1088
1089   DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
1090                doc: /* Char table of Unicode's "General Category".
1091 All Unicode characters have one of the following values (symbol):
1092   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1093   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1094 See The Unicode Standard for the meaning of those values.  */);
1095   /* The correct char-table is setup in characters.el.  */
1096   Vunicode_category_table = Qnil;
1097 }
1098
1099 #endif /* emacs */