src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2019 Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or (at
  15 your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
  24
/* Before reading this file, see the documentation in `character.h';
   it explains the concepts the code in this file relies on.  */
  27
  28 #include <config.h>
  29
  30 #include <stdio.h>
  31
  32 #include <sys/types.h>
  33 #include <intprops.h>
  34 #include "lisp.h"
  35 #include "character.h"
  36 #include "buffer.h"
  37 #include "dispextern.h"
  38 #include "composite.h"
  39 #include "disptab.h"
  40
/* Char-table recording, for each character, which Unicode character
   it should be unified to.  Mainly used by the macro
   MAYBE_UNIFY_CHAR.  */
Lisp_Object Vchar_unify_table;
  44
  45 \f
  46
  47 /* If character code C has modifier masks, reflect them to the
  48    character code if possible.  Return the resulting code.  */
  49
  50 EMACS_INT
  51 char_resolve_modifier_mask (EMACS_INT c)
  52 {
  53   /* A non-ASCII character can't reflect modifier bits to the code.  */
  54   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  55     return c;
  56
  57   /* For Meta, Shift, and Control modifiers, we need special care.  */
  58   if (c & CHAR_SHIFT)
  59     {
  60       /* Shift modifier is valid only with [A-Za-z].  */
  61       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  62         c &= ~CHAR_SHIFT;
  63       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  64         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  65       /* Shift modifier for control characters and SPC is ignored.  */
  66       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  67         c &= ~CHAR_SHIFT;
  68     }
  69   if (c & CHAR_CTL)
  70     {
  71       /* Simulate the code in lread.c.  */
  72       /* Allow `\C- ' and `\C-?'.  */
  73       if ((c & 0377) == ' ')
  74         c &= ~0177 & ~ CHAR_CTL;
  75       else if ((c & 0377) == '?')
  76         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  77       /* ASCII control chars are made from letters (both cases),
  78          as well as the non-letters within 0100...0137.  */
  79       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  80         c &= (037 | (~0177 & ~CHAR_CTL));
  81       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
  82         c &= (037 | (~0177 & ~CHAR_CTL));
  83     }
  84 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
  85   if (c & CHAR_META)
  86     {
  87       /* Move the meta bit to the right place for a string.  */
  88       c = (c & ~CHAR_META) | 0x80;
  89     }
  90 #endif
  91
  92   return c;
  93 }
  94
  95
  96 /* Store multibyte form of character C at P.  If C has modifier bits,
  97    handle them appropriately.  */
  98
  99 int
 100 char_string (unsigned int c, unsigned char *p)
 101 {
 102   int bytes;
 103
 104   if (c & CHAR_MODIFIER_MASK)
 105     {
 106       c = char_resolve_modifier_mask (c);
 107       /* If C still has any modifier bits, just ignore it.  */
 108       c &= ~CHAR_MODIFIER_MASK;
 109     }
 110
 111   if (c <= MAX_3_BYTE_CHAR)
 112     {
 113       bytes = CHAR_STRING (c, p);
 114     }
 115   else if (c <= MAX_4_BYTE_CHAR)
 116     {
 117       p[0] = (0xF0 | (c >> 18));
 118       p[1] = (0x80 | ((c >> 12) & 0x3F));
 119       p[2] = (0x80 | ((c >> 6) & 0x3F));
 120       p[3] = (0x80 | (c & 0x3F));
 121       bytes = 4;
 122     }
 123   else if (c <= MAX_5_BYTE_CHAR)
 124     {
 125       p[0] = 0xF8;
 126       p[1] = (0x80 | ((c >> 18) & 0x0F));
 127       p[2] = (0x80 | ((c >> 12) & 0x3F));
 128       p[3] = (0x80 | ((c >> 6) & 0x3F));
 129       p[4] = (0x80 | (c & 0x3F));
 130       bytes = 5;
 131     }
 132   else if (c <= MAX_CHAR)
 133     {
 134       c = CHAR_TO_BYTE8 (c);
 135       bytes = BYTE8_STRING (c, p);
 136     }
 137   else
 138     error ("Invalid character: %x", c);
 139
 140   return bytes;
 141 }
 142
 143
 144 /* Return a character whose multibyte form is at P.  If LEN is not
 145    NULL, it must be a pointer to integer.  In that case, set *LEN to
 146    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 147    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 148    the ending address (i.e., the starting address of the next
 149    character) of the multibyte form.  */
 150
 151 int
 152 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 153 {
 154   int c;
 155   const unsigned char *saved_p = p;
 156
 157   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 158     {
 159       /* 1-, 2-, and 3-byte sequences can be handled by the macro.  */
 160       c = STRING_CHAR_ADVANCE (p);
 161     }
 162   else if (! (*p & 0x08))
 163     {
 164       /* A 4-byte sequence of this form:
 165          11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  */
 166       c = ((((p)[0] & 0x7) << 18)
 167            | (((p)[1] & 0x3F) << 12)
 168            | (((p)[2] & 0x3F) << 6)
 169            | ((p)[3] & 0x3F));
 170       p += 4;
 171     }
 172   else
 173     {
 174       /* A 5-byte sequence of this form:
 175
 176          111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 177
 178          Note that the top 4 `x's are always 0, so shifting p[1] can
 179          never exceed the maximum valid character codepoint. */
 180       c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
 181            (((p)[1] & 0x3F) << 18)
 182            | (((p)[2] & 0x3F) << 12)
 183            | (((p)[3] & 0x3F) << 6)
 184            | ((p)[4] & 0x3F));
 185       p += 5;
 186     }
 187
 188   if (len)
 189     *len = p - saved_p;
 190   if (advanced)
 191     *advanced = p;
 192   return c;
 193 }
 194
 195
 196 /* Translate character C by translation table TABLE.  If no translation is
 197    found in TABLE, return the untranslated character.  If TABLE is a list,
 198    elements are char tables.  In that case, recursively translate C by all the
 199    tables in the list.  */
 200
 201 int
 202 translate_char (Lisp_Object table, int c)
 203 {
 204   if (CHAR_TABLE_P (table))
 205     {
 206       Lisp_Object ch;
 207
 208       ch = CHAR_TABLE_REF (table, c);
 209       if (CHARACTERP (ch))
 210         c = XINT (ch);
 211     }
 212   else
 213     {
 214       for (; CONSP (table); table = XCDR (table))
 215         c = translate_char (XCAR (table), c);
 216     }
 217   return c;
 218 }
 219
 220 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 221        doc: /* Return non-nil if OBJECT is a character.
 222 In Emacs Lisp, characters are represented by character codes, which
 223 are non-negative integers.  The function `max-char' returns the
 224 maximum character code.
 225 usage: (characterp OBJECT)  */
 226        attributes: const)
 227   (Lisp_Object object, Lisp_Object ignore)
 228 {
 229   return (CHARACTERP (object) ? Qt : Qnil);
 230 }
 231
 232 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 233        doc: /* Return the character of the maximum code.  */
 234        attributes: const)
 235   (void)
 236 {
 237   return make_number (MAX_CHAR);
 238 }
 239
 240 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 241        Sunibyte_char_to_multibyte, 1, 1, 0,
 242        doc: /* Convert the byte CH to multibyte character.  */)
 243   (Lisp_Object ch)
 244 {
 245   int c;
 246
 247   CHECK_CHARACTER (ch);
 248   c = XFASTINT (ch);
 249   if (c >= 0x100)
 250     error ("Not a unibyte character: %d", c);
 251   MAKE_CHAR_MULTIBYTE (c);
 252   return make_number (c);
 253 }
 254
 255 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 256        Smultibyte_char_to_unibyte, 1, 1, 0,
 257        doc: /* Convert the multibyte character CH to a byte.
 258 If the multibyte character does not represent a byte, return -1.  */)
 259   (Lisp_Object ch)
 260 {
 261   int cm;
 262
 263   CHECK_CHARACTER (ch);
 264   cm = XFASTINT (ch);
 265   if (cm < 256)
 266     /* Can't distinguish a byte read from a unibyte buffer from
 267        a latin1 char, so let's let it slide.  */
 268     return ch;
 269   else
 270     {
 271       int cu = CHAR_TO_BYTE_SAFE (cm);
 272       return make_number (cu);
 273     }
 274 }
 275
 276
 277 /* Return width (columns) of C considering the buffer display table DP. */
 278
 279 static ptrdiff_t
 280 char_width (int c, struct Lisp_Char_Table *dp)
 281 {
 282   ptrdiff_t width = CHARACTER_WIDTH (c);
 283
 284   if (dp)
 285     {
 286       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 287       int i;
 288
 289       if (VECTORP (disp))
 290         for (i = 0, width = 0; i < ASIZE (disp); i++)
 291           {
 292             int c = -1;
 293             ch = AREF (disp, i);
 294             if (GLYPH_CODE_P (ch))
 295               c = GLYPH_CODE_CHAR (ch);
 296             else if (CHARACTERP (ch))
 297               c = XFASTINT (ch);
 298             if (c >= 0)
 299               {
 300                 int w = CHARACTER_WIDTH (c);
 301                 if (INT_ADD_WRAPV (width, w, &width))
 302                   string_overflow ();
 303               }
 304           }
 305     }
 306   return width;
 307 }
 308
 309
 310 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 311        doc: /* Return width of CHAR when displayed in the current buffer.
 312 The width is measured by how many columns it occupies on the screen.
 313 Tab is taken to occupy `tab-width' columns.
 314 usage: (char-width CHAR)  */)
 315   (Lisp_Object ch)
 316 {
 317   int c;
 318   ptrdiff_t width;
 319
 320   CHECK_CHARACTER (ch);
 321   c = XINT (ch);
 322   width = char_width (c, buffer_display_table ());
 323   return make_number (width);
 324 }
 325
 326 /* Return width of string STR of length LEN when displayed in the
 327    current buffer.  The width is measured by how many columns it
 328    occupies on the screen.  If PRECISION > 0, return the width of
 329    longest substring that doesn't exceed PRECISION, and set number of
 330    characters and bytes of the substring in *NCHARS and *NBYTES
 331    respectively.  */
 332
 333 ptrdiff_t
 334 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 335                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 336 {
 337   ptrdiff_t i = 0, i_byte = 0;
 338   ptrdiff_t width = 0;
 339   struct Lisp_Char_Table *dp = buffer_display_table ();
 340
 341   while (i_byte < len)
 342     {
 343       int bytes;
 344       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 345       ptrdiff_t thiswidth = char_width (c, dp);
 346
 347       if (0 < precision && precision - width < thiswidth)
 348         {
 349           *nchars = i;
 350           *nbytes = i_byte;
 351           return width;
 352         }
 353       if (INT_ADD_WRAPV (thiswidth, width, &width))
 354         string_overflow ();
 355       i++;
 356       i_byte += bytes;
 357   }
 358
 359   if (precision > 0)
 360     {
 361       *nchars = i;
 362       *nbytes = i_byte;
 363     }
 364
 365   return width;
 366 }
 367
 368 /* Return width of string STR of length LEN when displayed in the
 369    current buffer.  The width is measured by how many columns it
 370    occupies on the screen.  */
 371
 372 ptrdiff_t
 373 strwidth (const char *str, ptrdiff_t len)
 374 {
 375   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 376 }
 377
 378 /* Return width of Lisp string STRING when displayed in the current
 379    buffer.  The width is measured by how many columns it occupies on
 380    the screen while paying attention to compositions.  If PRECISION >
 381    0, return the width of longest substring that doesn't exceed
 382    PRECISION, and set number of characters and bytes of the substring
 383    in *NCHARS and *NBYTES respectively.  */
 384
 385 ptrdiff_t
 386 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 387                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 388 {
 389   ptrdiff_t len = SCHARS (string);
 390   /* This set multibyte to 0 even if STRING is multibyte when it
 391      contains only ascii and eight-bit-graphic, but that's
 392      intentional.  */
 393   bool multibyte = len < SBYTES (string);
 394   unsigned char *str = SDATA (string);
 395   ptrdiff_t i = 0, i_byte = 0;
 396   ptrdiff_t width = 0;
 397   struct Lisp_Char_Table *dp = buffer_display_table ();
 398
 399   while (i < len)
 400     {
 401       ptrdiff_t chars, bytes, thiswidth;
 402       Lisp_Object val;
 403       ptrdiff_t cmp_id;
 404       ptrdiff_t ignore, end;
 405
 406       if (find_composition (i, -1, &ignore, &end, &val, string)
 407           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 408               >= 0))
 409         {
 410           thiswidth = composition_table[cmp_id]->width;
 411           chars = end - i;
 412           bytes = string_char_to_byte (string, end) - i_byte;
 413         }
 414       else
 415         {
 416           int c;
 417
 418           if (multibyte)
 419             {
 420               int cbytes;
 421               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 422               bytes = cbytes;
 423             }
 424           else
 425             c = str[i_byte], bytes = 1;
 426           chars = 1;
 427           thiswidth = char_width (c, dp);
 428         }
 429
 430       if (0 < precision && precision - width < thiswidth)
 431         {
 432           *nchars = i;
 433           *nbytes = i_byte;
 434           return width;
 435         }
 436       if (INT_ADD_WRAPV (thiswidth, width, &width))
 437         string_overflow ();
 438       i += chars;
 439       i_byte += bytes;
 440     }
 441
 442   if (precision > 0)
 443     {
 444       *nchars = i;
 445       *nbytes = i_byte;
 446     }
 447
 448   return width;
 449 }
 450
 451 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 452        doc: /* Return width of STRING when displayed in the current buffer.
 453 Width is measured by how many columns it occupies on the screen.
 454 When calculating width of a multibyte character in STRING,
 455 only the base leading-code is considered; the validity of
 456 the following bytes is not checked.  Tabs in STRING are always
 457 taken to occupy `tab-width' columns.
 458 usage: (string-width STRING)  */)
 459   (Lisp_Object str)
 460 {
 461   Lisp_Object val;
 462
 463   CHECK_STRING (str);
 464   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 465   return val;
 466 }
 467
 468 /* Return the number of characters in the NBYTES bytes at PTR.
 469    This works by looking at the contents and checking for multibyte
 470    sequences while assuming that there's no invalid sequence.
 471    However, if the current buffer has enable-multibyte-characters =
 472    nil, we treat each byte as a character.  */
 473
 474 ptrdiff_t
 475 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 476 {
 477   /* current_buffer is null at early stages of Emacs initialization.  */
 478   if (current_buffer == 0
 479       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 480     return nbytes;
 481
 482   return multibyte_chars_in_text (ptr, nbytes);
 483 }
 484
 485 /* Return the number of characters in the NBYTES bytes at PTR.
 486    This works by looking at the contents and checking for multibyte
 487    sequences while assuming that there's no invalid sequence.  It
 488    ignores enable-multibyte-characters.  */
 489
 490 ptrdiff_t
 491 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 492 {
 493   const unsigned char *endp = ptr + nbytes;
 494   ptrdiff_t chars = 0;
 495
 496   while (ptr < endp)
 497     {
 498       int len = MULTIBYTE_LENGTH (ptr, endp);
 499
 500       if (len == 0)
 501         emacs_abort ();
 502       ptr += len;
 503       chars++;
 504     }
 505
 506   return chars;
 507 }
 508
 509 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 510    characters and bytes in it, and store them in *NCHARS and *NBYTES
 511    respectively.  On counting bytes, pay attention to that 8-bit
 512    characters not constructing a valid multibyte sequence are
 513    represented by 2-byte in a multibyte text.  */
 514
 515 void
 516 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 517                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 518 {
 519   const unsigned char *endp = str + len;
 520   int n;
 521   ptrdiff_t chars = 0, bytes = 0;
 522
 523   if (len >= MAX_MULTIBYTE_LENGTH)
 524     {
 525       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 526       while (str < adjusted_endp)
 527         {
 528           if (! CHAR_BYTE8_HEAD_P (*str)
 529               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 530             str += n, bytes += n;
 531           else
 532             str++, bytes += 2;
 533           chars++;
 534         }
 535     }
 536   while (str < endp)
 537     {
 538       if (! CHAR_BYTE8_HEAD_P (*str)
 539           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 540         str += n, bytes += n;
 541       else
 542         str++, bytes += 2;
 543       chars++;
 544     }
 545
 546   *nchars = chars;
 547   *nbytes = bytes;
 548   return;
 549 }
 550
 551 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 552    It actually converts only such 8-bit characters that don't construct
 553    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 554    NCHARS is nonzero, set *NCHARS to the number of characters in the
 555    text.  It is assured that we can use LEN bytes at STR as a work
 556    area and that is enough.  Return the number of bytes of the
 557    resulting text.  */
 558
 559 ptrdiff_t
 560 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 561                   ptrdiff_t *nchars)
 562 {
 563   unsigned char *p = str, *endp = str + nbytes;
 564   unsigned char *to;
 565   ptrdiff_t chars = 0;
 566   int n;
 567
 568   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 569     {
 570       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 571       while (p < adjusted_endp
 572              && ! CHAR_BYTE8_HEAD_P (*p)
 573              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 574         p += n, chars++;
 575     }
 576   while (p < endp
 577          && ! CHAR_BYTE8_HEAD_P (*p)
 578          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 579     p += n, chars++;
 580   if (nchars)
 581     *nchars = chars;
 582   if (p == endp)
 583     return nbytes;
 584
 585   to = p;
 586   nbytes = endp - p;
 587   endp = str + len;
 588   memmove (endp - nbytes, p, nbytes);
 589   p = endp - nbytes;
 590
 591   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 592     {
 593       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 594       while (p < adjusted_endp)
 595         {
 596           if (! CHAR_BYTE8_HEAD_P (*p)
 597               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 598             {
 599               while (n--)
 600                 *to++ = *p++;
 601             }
 602           else
 603             {
 604               int c = *p++;
 605               c = BYTE8_TO_CHAR (c);
 606               to += CHAR_STRING (c, to);
 607             }
 608         }
 609       chars++;
 610     }
 611   while (p < endp)
 612     {
 613       if (! CHAR_BYTE8_HEAD_P (*p)
 614           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 615         {
 616           while (n--)
 617             *to++ = *p++;
 618         }
 619       else
 620         {
 621           int c = *p++;
 622           c = BYTE8_TO_CHAR (c);
 623           to += CHAR_STRING (c, to);
 624         }
 625       chars++;
 626     }
 627   if (nchars)
 628     *nchars = chars;
 629   return (to - str);
 630 }
 631
 632 /* Parse unibyte string at STR of LEN bytes, and return the number of
 633    bytes it may occupy when converted to multibyte string by
 634    `str_to_multibyte'.  */
 635
 636 ptrdiff_t
 637 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 638 {
 639   const unsigned char *endp = str + len;
 640   ptrdiff_t bytes;
 641
 642   for (bytes = 0; str < endp; str++)
 643     {
 644       int n = *str < 0x80 ? 1 : 2;
 645       if (INT_ADD_WRAPV (bytes, n, &bytes))
 646         string_overflow ();
 647     }
 648   return bytes;
 649 }
 650
 651
 652 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 653    that contains the same single-byte characters.  It actually
 654    converts all 8-bit characters to multibyte forms.  It is assured
 655    that we can use LEN bytes at STR as a work area and that is
 656    enough.  */
 657
 658 ptrdiff_t
 659 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 660 {
 661   unsigned char *p = str, *endp = str + bytes;
 662   unsigned char *to;
 663
 664   while (p < endp && *p < 0x80) p++;
 665   if (p == endp)
 666     return bytes;
 667   to = p;
 668   bytes = endp - p;
 669   endp = str + len;
 670   memmove (endp - bytes, p, bytes);
 671   p = endp - bytes;
 672   while (p < endp)
 673     {
 674       int c = *p++;
 675
 676       if (c >= 0x80)
 677         c = BYTE8_TO_CHAR (c);
 678       to += CHAR_STRING (c, to);
 679     }
 680   return (to - str);
 681 }
 682
 683 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 684    actually converts characters in the range 0x80..0xFF to
 685    unibyte.  */
 686
 687 ptrdiff_t
 688 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 689 {
 690   const unsigned char *p = str, *endp = str + bytes;
 691   unsigned char *to;
 692   int c, len;
 693
 694   while (p < endp)
 695     {
 696       c = *p;
 697       len = BYTES_BY_CHAR_HEAD (c);
 698       if (CHAR_BYTE8_HEAD_P (c))
 699         break;
 700       p += len;
 701     }
 702   to = str + (p - str);
 703   while (p < endp)
 704     {
 705       c = *p;
 706       len = BYTES_BY_CHAR_HEAD (c);
 707       if (CHAR_BYTE8_HEAD_P (c))
 708         {
 709           c = STRING_CHAR_ADVANCE (p);
 710           *to++ = CHAR_TO_BYTE8 (c);
 711         }
 712       else
 713         {
 714           while (len--) *to++ = *p++;
 715         }
 716     }
 717   return (to - str);
 718 }
 719
 720 /* Convert eight-bit chars in SRC (in multibyte form) to the
 721    corresponding byte and store in DST.  CHARS is the number of
 722    characters in SRC.  The value is the number of bytes stored in DST.
 723    Usually, the value is the same as CHARS, but is less than it if SRC
 724    contains a non-ASCII, non-eight-bit character.  */
 725
 726 ptrdiff_t
 727 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
 728 {
 729   ptrdiff_t i;
 730
 731   for (i = 0; i < chars; i++)
 732     {
 733       int c = STRING_CHAR_ADVANCE (src);
 734
 735       if (CHAR_BYTE8_P (c))
 736         c = CHAR_TO_BYTE8 (c);
 737       else if (! ASCII_CHAR_P (c))
 738         return i;
 739       *dst++ = c;
 740     }
 741   return i;
 742 }
 743
 744
 745 static ptrdiff_t
 746 string_count_byte8 (Lisp_Object string)
 747 {
 748   bool multibyte = STRING_MULTIBYTE (string);
 749   ptrdiff_t nbytes = SBYTES (string);
 750   unsigned char *p = SDATA (string);
 751   unsigned char *pend = p + nbytes;
 752   ptrdiff_t count = 0;
 753   int c, len;
 754
 755   if (multibyte)
 756     while (p < pend)
 757       {
 758         c = *p;
 759         len = BYTES_BY_CHAR_HEAD (c);
 760
 761         if (CHAR_BYTE8_HEAD_P (c))
 762           count++;
 763         p += len;
 764       }
 765   else
 766     while (p < pend)
 767       {
 768         if (*p++ >= 0x80)
 769           count++;
 770       }
 771   return count;
 772 }
 773
 774
 775 Lisp_Object
 776 string_escape_byte8 (Lisp_Object string)
 777 {
 778   ptrdiff_t nchars = SCHARS (string);
 779   ptrdiff_t nbytes = SBYTES (string);
 780   bool multibyte = STRING_MULTIBYTE (string);
 781   ptrdiff_t byte8_count;
 782   ptrdiff_t thrice_byte8_count, uninit_nchars, uninit_nbytes;
 783   const unsigned char *src, *src_end;
 784   unsigned char *dst;
 785   Lisp_Object val;
 786   int c, len;
 787
 788   if (multibyte && nchars == nbytes)
 789     return string;
 790
 791   byte8_count = string_count_byte8 (string);
 792
 793   if (byte8_count == 0)
 794     return string;
 795
 796   if (INT_MULTIPLY_WRAPV (byte8_count, 3, &thrice_byte8_count))
 797     string_overflow ();
 798
 799   if (multibyte)
 800     {
 801       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 802       if (INT_ADD_WRAPV (nchars, thrice_byte8_count, &uninit_nchars)
 803           || INT_ADD_WRAPV (nbytes, 2 * byte8_count, &uninit_nbytes))
 804         string_overflow ();
 805       val = make_uninit_multibyte_string (uninit_nchars, uninit_nbytes);
 806     }
 807   else
 808     {
 809       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 810       if (INT_ADD_WRAPV (thrice_byte8_count, nbytes, &uninit_nbytes))
 811         string_overflow ();
 812       val = make_uninit_string (uninit_nbytes);
 813     }
 814
 815   src = SDATA (string);
 816   src_end = src + nbytes;
 817   dst = SDATA (val);
 818   if (multibyte)
 819     while (src < src_end)
 820       {
 821         c = *src;
 822         len = BYTES_BY_CHAR_HEAD (c);
 823
 824         if (CHAR_BYTE8_HEAD_P (c))
 825           {
 826             c = STRING_CHAR_ADVANCE (src);
 827             c = CHAR_TO_BYTE8 (c);
 828             dst += sprintf ((char *) dst, "\\%03o", c + 0u);
 829           }
 830         else
 831           while (len--) *dst++ = *src++;
 832       }
 833   else
 834     while (src < src_end)
 835       {
 836         c = *src++;
 837         if (c >= 0x80)
 838           dst += sprintf ((char *) dst, "\\%03o", c + 0u);
 839         else
 840           *dst++ = c;
 841       }
 842   return val;
 843 }
 844
 845 \f
 846 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 847        doc: /*
 848 Concatenate all the argument characters and make the result a string.
 849 usage: (string &rest CHARACTERS)  */)
 850   (ptrdiff_t n, Lisp_Object *args)
 851 {
 852   ptrdiff_t i;
 853   int c;
 854   unsigned char *buf, *p;
 855   Lisp_Object str;
 856   USE_SAFE_ALLOCA;
 857
 858   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 859   p = buf;
 860
 861   for (i = 0; i < n; i++)
 862     {
 863       CHECK_CHARACTER (args[i]);
 864       c = XINT (args[i]);
 865       p += CHAR_STRING (c, p);
 866     }
 867
 868   str = make_string_from_bytes ((char *) buf, n, p - buf);
 869   SAFE_FREE ();
 870   return str;
 871 }
 872
 873 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 874        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 875 usage: (unibyte-string &rest BYTES)  */)
 876   (ptrdiff_t n, Lisp_Object *args)
 877 {
 878   ptrdiff_t i;
 879   Lisp_Object str;
 880   USE_SAFE_ALLOCA;
 881   unsigned char *buf = SAFE_ALLOCA (n);
 882   unsigned char *p = buf;
 883
 884   for (i = 0; i < n; i++)
 885     {
 886       CHECK_RANGED_INTEGER (args[i], 0, 255);
 887       *p++ = XINT (args[i]);
 888     }
 889
 890   str = make_string_from_bytes ((char *) buf, n, p - buf);
 891   SAFE_FREE ();
 892   return str;
 893 }
 894
 895 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 896        Schar_resolve_modifiers, 1, 1, 0,
 897        doc: /* Resolve modifiers in the character CHAR.
 898 The value is a character with modifiers resolved into the character
 899 code.  Unresolved modifiers are kept in the value.
 900 usage: (char-resolve-modifiers CHAR)  */)
 901   (Lisp_Object character)
 902 {
 903   EMACS_INT c;
 904
 905   CHECK_NUMBER (character);
 906   c = XINT (character);
 907   return make_number (char_resolve_modifier_mask (c));
 908 }
 909
 910 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 911        doc: /* Return a byte value of a character at point.
 912 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 913 a byte value.
 914 Optional 2nd arg STRING, if non-nil, is a string of which first
 915 character is a target to get a byte value.  In this case, POSITION, if
 916 non-nil, is an index of a target character in the string.
 917
 918 If the current buffer (or STRING) is multibyte, and the target
 919 character is not ASCII nor 8-bit character, an error is signaled.  */)
 920   (Lisp_Object position, Lisp_Object string)
 921 {
 922   int c;
 923   ptrdiff_t pos;
 924   unsigned char *p;
 925
 926   if (NILP (string))
 927     {
 928       if (NILP (position))
 929         {
 930           p = PT_ADDR;
 931         }
 932       else
 933         {
 934           CHECK_NUMBER_COERCE_MARKER (position);
 935           if (XINT (position) < BEGV || XINT (position) >= ZV)
 936             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 937           pos = XFASTINT (position);
 938           p = CHAR_POS_ADDR (pos);
 939         }
 940       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 941         return make_number (*p);
 942     }
 943   else
 944     {
 945       CHECK_STRING (string);
 946       if (NILP (position))
 947         {
 948           p = SDATA (string);
 949         }
 950       else
 951         {
 952           CHECK_NATNUM (position);
 953           if (XINT (position) >= SCHARS (string))
 954             args_out_of_range (string, position);
 955           pos = XFASTINT (position);
 956           p = SDATA (string) + string_char_to_byte (string, pos);
 957         }
 958       if (! STRING_MULTIBYTE (string))
 959         return make_number (*p);
 960     }
 961   c = STRING_CHAR (p);
 962   if (CHAR_BYTE8_P (c))
 963     c = CHAR_TO_BYTE8 (c);
 964   else if (! ASCII_CHAR_P (c))
 965     error ("Not an ASCII nor an 8-bit character: %d", c);
 966   return make_number (c);
 967 }
 968
 969 /* Return true if C is an alphabetic character.  */
 970 bool
 971 alphabeticp (int c)
 972 {
 973   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
 974   if (! INTEGERP (category))
 975     return false;
 976   EMACS_INT gen_cat = XINT (category);
 977
 978   /* See UTS #18.  There are additional characters that should be
 979      here, those designated as Other_uppercase, Other_lowercase,
 980      and Other_alphabetic; FIXME.  */
 981   return (gen_cat == UNICODE_CATEGORY_Lu
 982           || gen_cat == UNICODE_CATEGORY_Ll
 983           || gen_cat == UNICODE_CATEGORY_Lt
 984           || gen_cat == UNICODE_CATEGORY_Lm
 985           || gen_cat == UNICODE_CATEGORY_Lo
 986           || gen_cat == UNICODE_CATEGORY_Mn
 987           || gen_cat == UNICODE_CATEGORY_Mc
 988           || gen_cat == UNICODE_CATEGORY_Me
 989           || gen_cat == UNICODE_CATEGORY_Nl);
 990 }
 991
 992 /* Return true if C is an alphabetic or decimal-number character.  */
 993 bool
 994 alphanumericp (int c)
 995 {
 996   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
 997   if (! INTEGERP (category))
 998     return false;
 999   EMACS_INT gen_cat = XINT (category);
1000
1001   /* See UTS #18.  Same comment as for alphabeticp applies.  FIXME. */
1002   return (gen_cat == UNICODE_CATEGORY_Lu
1003           || gen_cat == UNICODE_CATEGORY_Ll
1004           || gen_cat == UNICODE_CATEGORY_Lt
1005           || gen_cat == UNICODE_CATEGORY_Lm
1006           || gen_cat == UNICODE_CATEGORY_Lo
1007           || gen_cat == UNICODE_CATEGORY_Mn
1008           || gen_cat == UNICODE_CATEGORY_Mc
1009           || gen_cat == UNICODE_CATEGORY_Me
1010           || gen_cat == UNICODE_CATEGORY_Nl
1011           || gen_cat == UNICODE_CATEGORY_Nd);
1012 }
1013
1014 /* Return true if C is a graphic character.  */
1015 bool
1016 graphicp (int c)
1017 {
1018   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1019   if (! INTEGERP (category))
1020     return false;
1021   EMACS_INT gen_cat = XINT (category);
1022
1023   /* See UTS #18.  */
1024   return (!(gen_cat == UNICODE_CATEGORY_Zs /* space separator */
1025             || gen_cat == UNICODE_CATEGORY_Zl /* line separator */
1026             || gen_cat == UNICODE_CATEGORY_Zp /* paragraph separator */
1027             || gen_cat == UNICODE_CATEGORY_Cc /* control */
1028             || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
1029             || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
1030 }
1031
1032 /* Return true if C is a printable character.  */
1033 bool
1034 printablep (int c)
1035 {
1036   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1037   if (! INTEGERP (category))
1038     return false;
1039   EMACS_INT gen_cat = XINT (category);
1040
1041   /* See UTS #18.  */
1042   return (!(gen_cat == UNICODE_CATEGORY_Cc /* control */
1043             || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
1044             || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
1045 }
1046
1047 /* Return true if C is a horizontal whitespace character, as defined
1048    by http://www.unicode.org/reports/tr18/tr18-19.html#blank.  */
1049 bool
1050 blankp (int c)
1051 {
1052   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1053   if (! INTEGERP (category))
1054     return false;
1055
1056   return XINT (category) == UNICODE_CATEGORY_Zs; /* separator, space */
1057 }
1058
1059 signed char HEXDIGIT_CONST hexdigit[UCHAR_MAX + 1] =
1060   {
1061 #if HEXDIGIT_IS_CONST
1062     [0 ... UCHAR_MAX] = -1,
1063 #endif
1064     ['0'] = 0, ['1'] = 1, ['2'] = 2, ['3'] = 3, ['4'] = 4,
1065     ['5'] = 5, ['6'] = 6, ['7'] = 7, ['8'] = 8, ['9'] = 9,
1066     ['A'] = 10, ['B'] = 11, ['C'] = 12, ['D'] = 13, ['E'] = 14, ['F'] = 15,
1067     ['a'] = 10, ['b'] = 11, ['c'] = 12, ['d'] = 13, ['e'] = 14, ['f'] = 15
1068   };
1069
1070 void
1071 syms_of_character (void)
1072 {
1073 #if !HEXDIGIT_IS_CONST
1074   /* Set the non-hex digit values to -1.  */
1075   for (int i = 0; i <= UCHAR_MAX; i++)
1076     hexdigit[i] -= i != '0' && !hexdigit[i];
1077 #endif
1078
1079   DEFSYM (Qcharacterp, "characterp");
1080   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1081
1082   staticpro (&Vchar_unify_table);
1083   Vchar_unify_table = Qnil;
1084
1085   defsubr (&Smax_char);
1086   defsubr (&Scharacterp);
1087   defsubr (&Sunibyte_char_to_multibyte);
1088   defsubr (&Smultibyte_char_to_unibyte);
1089   defsubr (&Schar_width);
1090   defsubr (&Sstring_width);
1091   defsubr (&Sstring);
1092   defsubr (&Sunibyte_string);
1093   defsubr (&Schar_resolve_modifiers);
1094   defsubr (&Sget_byte);
1095
1096   DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
1097                doc: /*
1098 Vector recording all translation tables ever defined.
1099 Each element is a pair (SYMBOL . TABLE) relating the table to the
1100 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1101   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1102
1103   DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
1104                doc: /*
1105 A char-table for characters which invoke auto-filling.
1106 Such characters have value t in this table.  */);
1107   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1108   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1109   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1110
1111   DEFVAR_LISP ("char-width-table", Vchar_width_table,
1112                doc: /*
1113 A char-table for width (columns) of each character.  */);
1114   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1115   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1116   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1117                         make_number (4));
1118
1119   DEFVAR_LISP ("printable-chars", Vprintable_chars,
1120                doc: /* A char-table for each printable character.  */);
1121   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1122   Fset_char_table_range (Vprintable_chars,
1123                          Fcons (make_number (32), make_number (126)), Qt);
1124   Fset_char_table_range (Vprintable_chars,
1125                          Fcons (make_number (160),
1126                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1127
1128   DEFVAR_LISP ("char-script-table", Vchar_script_table,
1129                doc: /* Char table of script symbols.
1130 It has one extra slot whose value is a list of script symbols.  */);
1131
1132   DEFSYM (Qchar_script_table, "char-script-table");
1133   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1134   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1135
1136   DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
1137                doc: /* Alist of scripts vs the representative characters.
1138 Each element is a cons (SCRIPT . CHARS).
1139 SCRIPT is a symbol representing a script or a subgroup of a script.
1140 CHARS is a list or a vector of characters.
1141 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1142 If it is a vector, one of the characters in the vector is necessary.
1143 This variable is used to find a font for a specific script.  */);
1144   Vscript_representative_chars = Qnil;
1145
1146   DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
1147                doc: /* Char table of Unicode's "General Category".
1148 All Unicode characters have one of the following values (symbol):
1149   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1150   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1151 See The Unicode Standard for the meaning of those values.  */);
1152   /* The correct char-table is setup in characters.el.  */
1153   Vunicode_category_table = Qnil;
1154 }