src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2018 Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or (at
  15 your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #include <config.h>
  29
  30 #include <stdio.h>
  31
  32 #include <sys/types.h>
  33 #include <intprops.h>
  34 #include "lisp.h"
  35 #include "character.h"
  36 #include "buffer.h"
  37 #include "composite.h"
  38 #include "disptab.h"
  39
  40 /* Char-table of information about which character to unify to which
  41    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  42 Lisp_Object Vchar_unify_table;
  43
  44 \f
  45
  46 /* If character code C has modifier masks, reflect them to the
  47    character code if possible.  Return the resulting code.  */
  48
  49 EMACS_INT
  50 char_resolve_modifier_mask (EMACS_INT c)
  51 {
  52   /* A non-ASCII character can't reflect modifier bits to the code.  */
  53   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  54     return c;
  55
  56   /* For Meta, Shift, and Control modifiers, we need special care.  */
  57   if (c & CHAR_SHIFT)
  58     {
  59       /* Shift modifier is valid only with [A-Za-z].  */
  60       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  61         c &= ~CHAR_SHIFT;
  62       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  63         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  64       /* Shift modifier for control characters and SPC is ignored.  */
  65       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  66         c &= ~CHAR_SHIFT;
  67     }
  68   if (c & CHAR_CTL)
  69     {
  70       /* Simulate the code in lread.c.  */
  71       /* Allow `\C- ' and `\C-?'.  */
  72       if ((c & 0377) == ' ')
  73         c &= ~0177 & ~ CHAR_CTL;
  74       else if ((c & 0377) == '?')
  75         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  76       /* ASCII control chars are made from letters (both cases),
  77          as well as the non-letters within 0100...0137.  */
  78       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  79         c &= (037 | (~0177 & ~CHAR_CTL));
  80       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
  81         c &= (037 | (~0177 & ~CHAR_CTL));
  82     }
  83 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
  84   if (c & CHAR_META)
  85     {
  86       /* Move the meta bit to the right place for a string.  */
  87       c = (c & ~CHAR_META) | 0x80;
  88     }
  89 #endif
  90
  91   return c;
  92 }
  93
  94
  95 /* Store multibyte form of character C at P.  If C has modifier bits,
  96    handle them appropriately.  */
  97
  98 int
  99 char_string (unsigned int c, unsigned char *p)
 100 {
 101   int bytes;
 102
 103   if (c & CHAR_MODIFIER_MASK)
 104     {
 105       c = char_resolve_modifier_mask (c);
 106       /* If C still has any modifier bits, just ignore it.  */
 107       c &= ~CHAR_MODIFIER_MASK;
 108     }
 109
 110   if (c <= MAX_3_BYTE_CHAR)
 111     {
 112       bytes = CHAR_STRING (c, p);
 113     }
 114   else if (c <= MAX_4_BYTE_CHAR)
 115     {
 116       p[0] = (0xF0 | (c >> 18));
 117       p[1] = (0x80 | ((c >> 12) & 0x3F));
 118       p[2] = (0x80 | ((c >> 6) & 0x3F));
 119       p[3] = (0x80 | (c & 0x3F));
 120       bytes = 4;
 121     }
 122   else if (c <= MAX_5_BYTE_CHAR)
 123     {
 124       p[0] = 0xF8;
 125       p[1] = (0x80 | ((c >> 18) & 0x0F));
 126       p[2] = (0x80 | ((c >> 12) & 0x3F));
 127       p[3] = (0x80 | ((c >> 6) & 0x3F));
 128       p[4] = (0x80 | (c & 0x3F));
 129       bytes = 5;
 130     }
 131   else if (c <= MAX_CHAR)
 132     {
 133       c = CHAR_TO_BYTE8 (c);
 134       bytes = BYTE8_STRING (c, p);
 135     }
 136   else
 137     error ("Invalid character: %x", c);
 138
 139   return bytes;
 140 }
 141
 142
 143 /* Return a character whose multibyte form is at P.  If LEN is not
 144    NULL, it must be a pointer to integer.  In that case, set *LEN to
 145    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 146    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 147    the ending address (i.e., the starting address of the next
 148    character) of the multibyte form.  */
 149
 150 int
 151 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 152 {
 153   int c;
 154   const unsigned char *saved_p = p;
 155
 156   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 157     {
 158       /* 1-, 2-, and 3-byte sequences can be handled by the macro.  */
 159       c = STRING_CHAR_ADVANCE (p);
 160     }
 161   else if (! (*p & 0x08))
 162     {
 163       /* A 4-byte sequence of this form:
 164          11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  */
 165       c = ((((p)[0] & 0x7) << 18)
 166            | (((p)[1] & 0x3F) << 12)
 167            | (((p)[2] & 0x3F) << 6)
 168            | ((p)[3] & 0x3F));
 169       p += 4;
 170     }
 171   else
 172     {
 173       /* A 5-byte sequence of this form:
 174
 175          111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 176
 177          Note that the top 4 `x's are always 0, so shifting p[1] can
 178          never exceed the maximum valid character codepoint. */
 179       c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
 180            (((p)[1] & 0x3F) << 18)
 181            | (((p)[2] & 0x3F) << 12)
 182            | (((p)[3] & 0x3F) << 6)
 183            | ((p)[4] & 0x3F));
 184       p += 5;
 185     }
 186
 187   if (len)
 188     *len = p - saved_p;
 189   if (advanced)
 190     *advanced = p;
 191   return c;
 192 }
 193
 194
 195 /* Translate character C by translation table TABLE.  If no translation is
 196    found in TABLE, return the untranslated character.  If TABLE is a list,
 197    elements are char tables.  In that case, recursively translate C by all the
 198    tables in the list.  */
 199
 200 int
 201 translate_char (Lisp_Object table, int c)
 202 {
 203   if (CHAR_TABLE_P (table))
 204     {
 205       Lisp_Object ch;
 206
 207       ch = CHAR_TABLE_REF (table, c);
 208       if (CHARACTERP (ch))
 209         c = XINT (ch);
 210     }
 211   else
 212     {
 213       for (; CONSP (table); table = XCDR (table))
 214         c = translate_char (XCAR (table), c);
 215     }
 216   return c;
 217 }
 218
 219 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 220        doc: /* Return non-nil if OBJECT is a character.
 221 In Emacs Lisp, characters are represented by character codes, which
 222 are non-negative integers.  The function `max-char' returns the
 223 maximum character code.
 224 usage: (characterp OBJECT)  */
 225        attributes: const)
 226   (Lisp_Object object, Lisp_Object ignore)
 227 {
 228   return (CHARACTERP (object) ? Qt : Qnil);
 229 }
 230
 231 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 232        doc: /* Return the character of the maximum code.  */
 233        attributes: const)
 234   (void)
 235 {
 236   return make_number (MAX_CHAR);
 237 }
 238
 239 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 240        Sunibyte_char_to_multibyte, 1, 1, 0,
 241        doc: /* Convert the byte CH to multibyte character.  */)
 242   (Lisp_Object ch)
 243 {
 244   int c;
 245
 246   CHECK_CHARACTER (ch);
 247   c = XFASTINT (ch);
 248   if (c >= 0x100)
 249     error ("Not a unibyte character: %d", c);
 250   MAKE_CHAR_MULTIBYTE (c);
 251   return make_number (c);
 252 }
 253
 254 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 255        Smultibyte_char_to_unibyte, 1, 1, 0,
 256        doc: /* Convert the multibyte character CH to a byte.
 257 If the multibyte character does not represent a byte, return -1.  */)
 258   (Lisp_Object ch)
 259 {
 260   int cm;
 261
 262   CHECK_CHARACTER (ch);
 263   cm = XFASTINT (ch);
 264   if (cm < 256)
 265     /* Can't distinguish a byte read from a unibyte buffer from
 266        a latin1 char, so let's let it slide.  */
 267     return ch;
 268   else
 269     {
 270       int cu = CHAR_TO_BYTE_SAFE (cm);
 271       return make_number (cu);
 272     }
 273 }
 274
 275
 276 /* Return width (columns) of C considering the buffer display table DP. */
 277
 278 static ptrdiff_t
 279 char_width (int c, struct Lisp_Char_Table *dp)
 280 {
 281   ptrdiff_t width = CHARACTER_WIDTH (c);
 282
 283   if (dp)
 284     {
 285       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 286       int i;
 287
 288       if (VECTORP (disp))
 289         for (i = 0, width = 0; i < ASIZE (disp); i++)
 290           {
 291             ch = AREF (disp, i);
 292             if (CHARACTERP (ch))
 293               {
 294                 int w = CHARACTER_WIDTH (XFASTINT (ch));
 295                 if (INT_ADD_WRAPV (width, w, &width))
 296                   string_overflow ();
 297               }
 298           }
 299     }
 300   return width;
 301 }
 302
 303
 304 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 305        doc: /* Return width of CHAR when displayed in the current buffer.
 306 The width is measured by how many columns it occupies on the screen.
 307 Tab is taken to occupy `tab-width' columns.
 308 usage: (char-width CHAR)  */)
 309   (Lisp_Object ch)
 310 {
 311   int c;
 312   ptrdiff_t width;
 313
 314   CHECK_CHARACTER (ch);
 315   c = XINT (ch);
 316   width = char_width (c, buffer_display_table ());
 317   return make_number (width);
 318 }
 319
 320 /* Return width of string STR of length LEN when displayed in the
 321    current buffer.  The width is measured by how many columns it
 322    occupies on the screen.  If PRECISION > 0, return the width of
 323    longest substring that doesn't exceed PRECISION, and set number of
 324    characters and bytes of the substring in *NCHARS and *NBYTES
 325    respectively.  */
 326
 327 ptrdiff_t
 328 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 329                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 330 {
 331   ptrdiff_t i = 0, i_byte = 0;
 332   ptrdiff_t width = 0;
 333   struct Lisp_Char_Table *dp = buffer_display_table ();
 334
 335   while (i_byte < len)
 336     {
 337       int bytes;
 338       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 339       ptrdiff_t thiswidth = char_width (c, dp);
 340
 341       if (0 < precision && precision - width < thiswidth)
 342         {
 343           *nchars = i;
 344           *nbytes = i_byte;
 345           return width;
 346         }
 347       if (INT_ADD_WRAPV (thiswidth, width, &width))
 348         string_overflow ();
 349       i++;
 350       i_byte += bytes;
 351   }
 352
 353   if (precision > 0)
 354     {
 355       *nchars = i;
 356       *nbytes = i_byte;
 357     }
 358
 359   return width;
 360 }
 361
 362 /* Return width of string STR of length LEN when displayed in the
 363    current buffer.  The width is measured by how many columns it
 364    occupies on the screen.  */
 365
 366 ptrdiff_t
 367 strwidth (const char *str, ptrdiff_t len)
 368 {
 369   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 370 }
 371
 372 /* Return width of Lisp string STRING when displayed in the current
 373    buffer.  The width is measured by how many columns it occupies on
 374    the screen while paying attention to compositions.  If PRECISION >
 375    0, return the width of longest substring that doesn't exceed
 376    PRECISION, and set number of characters and bytes of the substring
 377    in *NCHARS and *NBYTES respectively.  */
 378
 379 ptrdiff_t
 380 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 381                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 382 {
 383   ptrdiff_t len = SCHARS (string);
 384   /* This set multibyte to 0 even if STRING is multibyte when it
 385      contains only ascii and eight-bit-graphic, but that's
 386      intentional.  */
 387   bool multibyte = len < SBYTES (string);
 388   unsigned char *str = SDATA (string);
 389   ptrdiff_t i = 0, i_byte = 0;
 390   ptrdiff_t width = 0;
 391   struct Lisp_Char_Table *dp = buffer_display_table ();
 392
 393   while (i < len)
 394     {
 395       ptrdiff_t chars, bytes, thiswidth;
 396       Lisp_Object val;
 397       ptrdiff_t cmp_id;
 398       ptrdiff_t ignore, end;
 399
 400       if (find_composition (i, -1, &ignore, &end, &val, string)
 401           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 402               >= 0))
 403         {
 404           thiswidth = composition_table[cmp_id]->width;
 405           chars = end - i;
 406           bytes = string_char_to_byte (string, end) - i_byte;
 407         }
 408       else
 409         {
 410           int c;
 411
 412           if (multibyte)
 413             {
 414               int cbytes;
 415               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 416               bytes = cbytes;
 417             }
 418           else
 419             c = str[i_byte], bytes = 1;
 420           chars = 1;
 421           thiswidth = char_width (c, dp);
 422         }
 423
 424       if (0 < precision && precision - width < thiswidth)
 425         {
 426           *nchars = i;
 427           *nbytes = i_byte;
 428           return width;
 429         }
 430       if (INT_ADD_WRAPV (thiswidth, width, &width))
 431         string_overflow ();
 432       i += chars;
 433       i_byte += bytes;
 434     }
 435
 436   if (precision > 0)
 437     {
 438       *nchars = i;
 439       *nbytes = i_byte;
 440     }
 441
 442   return width;
 443 }
 444
 445 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 446        doc: /* Return width of STRING when displayed in the current buffer.
 447 Width is measured by how many columns it occupies on the screen.
 448 When calculating width of a multibyte character in STRING,
 449 only the base leading-code is considered; the validity of
 450 the following bytes is not checked.  Tabs in STRING are always
 451 taken to occupy `tab-width' columns.
 452 usage: (string-width STRING)  */)
 453   (Lisp_Object str)
 454 {
 455   Lisp_Object val;
 456
 457   CHECK_STRING (str);
 458   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 459   return val;
 460 }
 461
 462 /* Return the number of characters in the NBYTES bytes at PTR.
 463    This works by looking at the contents and checking for multibyte
 464    sequences while assuming that there's no invalid sequence.
 465    However, if the current buffer has enable-multibyte-characters =
 466    nil, we treat each byte as a character.  */
 467
 468 ptrdiff_t
 469 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 470 {
 471   /* current_buffer is null at early stages of Emacs initialization.  */
 472   if (current_buffer == 0
 473       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 474     return nbytes;
 475
 476   return multibyte_chars_in_text (ptr, nbytes);
 477 }
 478
 479 /* Return the number of characters in the NBYTES bytes at PTR.
 480    This works by looking at the contents and checking for multibyte
 481    sequences while assuming that there's no invalid sequence.  It
 482    ignores enable-multibyte-characters.  */
 483
 484 ptrdiff_t
 485 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 486 {
 487   const unsigned char *endp = ptr + nbytes;
 488   ptrdiff_t chars = 0;
 489
 490   while (ptr < endp)
 491     {
 492       int len = MULTIBYTE_LENGTH (ptr, endp);
 493
 494       if (len == 0)
 495         emacs_abort ();
 496       ptr += len;
 497       chars++;
 498     }
 499
 500   return chars;
 501 }
 502
 503 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 504    characters and bytes in it, and store them in *NCHARS and *NBYTES
 505    respectively.  On counting bytes, pay attention to that 8-bit
 506    characters not constructing a valid multibyte sequence are
 507    represented by 2-byte in a multibyte text.  */
 508
 509 void
 510 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 511                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 512 {
 513   const unsigned char *endp = str + len;
 514   int n;
 515   ptrdiff_t chars = 0, bytes = 0;
 516
 517   if (len >= MAX_MULTIBYTE_LENGTH)
 518     {
 519       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 520       while (str < adjusted_endp)
 521         {
 522           if (! CHAR_BYTE8_HEAD_P (*str)
 523               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 524             str += n, bytes += n;
 525           else
 526             str++, bytes += 2;
 527           chars++;
 528         }
 529     }
 530   while (str < endp)
 531     {
 532       if (! CHAR_BYTE8_HEAD_P (*str)
 533           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 534         str += n, bytes += n;
 535       else
 536         str++, bytes += 2;
 537       chars++;
 538     }
 539
 540   *nchars = chars;
 541   *nbytes = bytes;
 542   return;
 543 }
 544
 545 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 546    It actually converts only such 8-bit characters that don't construct
 547    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 548    NCHARS is nonzero, set *NCHARS to the number of characters in the
 549    text.  It is assured that we can use LEN bytes at STR as a work
 550    area and that is enough.  Return the number of bytes of the
 551    resulting text.  */
 552
 553 ptrdiff_t
 554 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 555                   ptrdiff_t *nchars)
 556 {
 557   unsigned char *p = str, *endp = str + nbytes;
 558   unsigned char *to;
 559   ptrdiff_t chars = 0;
 560   int n;
 561
 562   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 563     {
 564       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 565       while (p < adjusted_endp
 566              && ! CHAR_BYTE8_HEAD_P (*p)
 567              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 568         p += n, chars++;
 569     }
 570   while (p < endp
 571          && ! CHAR_BYTE8_HEAD_P (*p)
 572          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 573     p += n, chars++;
 574   if (nchars)
 575     *nchars = chars;
 576   if (p == endp)
 577     return nbytes;
 578
 579   to = p;
 580   nbytes = endp - p;
 581   endp = str + len;
 582   memmove (endp - nbytes, p, nbytes);
 583   p = endp - nbytes;
 584
 585   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 586     {
 587       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 588       while (p < adjusted_endp)
 589         {
 590           if (! CHAR_BYTE8_HEAD_P (*p)
 591               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 592             {
 593               while (n--)
 594                 *to++ = *p++;
 595             }
 596           else
 597             {
 598               int c = *p++;
 599               c = BYTE8_TO_CHAR (c);
 600               to += CHAR_STRING (c, to);
 601             }
 602         }
 603       chars++;
 604     }
 605   while (p < endp)
 606     {
 607       if (! CHAR_BYTE8_HEAD_P (*p)
 608           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 609         {
 610           while (n--)
 611             *to++ = *p++;
 612         }
 613       else
 614         {
 615           int c = *p++;
 616           c = BYTE8_TO_CHAR (c);
 617           to += CHAR_STRING (c, to);
 618         }
 619       chars++;
 620     }
 621   if (nchars)
 622     *nchars = chars;
 623   return (to - str);
 624 }
 625
 626 /* Parse unibyte string at STR of LEN bytes, and return the number of
 627    bytes it may occupy when converted to multibyte string by
 628    `str_to_multibyte'.  */
 629
 630 ptrdiff_t
 631 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 632 {
 633   const unsigned char *endp = str + len;
 634   ptrdiff_t bytes;
 635
 636   for (bytes = 0; str < endp; str++)
 637     {
 638       int n = *str < 0x80 ? 1 : 2;
 639       if (INT_ADD_WRAPV (bytes, n, &bytes))
 640         string_overflow ();
 641     }
 642   return bytes;
 643 }
 644
 645
 646 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 647    that contains the same single-byte characters.  It actually
 648    converts all 8-bit characters to multibyte forms.  It is assured
 649    that we can use LEN bytes at STR as a work area and that is
 650    enough.  */
 651
 652 ptrdiff_t
 653 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 654 {
 655   unsigned char *p = str, *endp = str + bytes;
 656   unsigned char *to;
 657
 658   while (p < endp && *p < 0x80) p++;
 659   if (p == endp)
 660     return bytes;
 661   to = p;
 662   bytes = endp - p;
 663   endp = str + len;
 664   memmove (endp - bytes, p, bytes);
 665   p = endp - bytes;
 666   while (p < endp)
 667     {
 668       int c = *p++;
 669
 670       if (c >= 0x80)
 671         c = BYTE8_TO_CHAR (c);
 672       to += CHAR_STRING (c, to);
 673     }
 674   return (to - str);
 675 }
 676
 677 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 678    actually converts characters in the range 0x80..0xFF to
 679    unibyte.  */
 680
 681 ptrdiff_t
 682 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 683 {
 684   const unsigned char *p = str, *endp = str + bytes;
 685   unsigned char *to;
 686   int c, len;
 687
 688   while (p < endp)
 689     {
 690       c = *p;
 691       len = BYTES_BY_CHAR_HEAD (c);
 692       if (CHAR_BYTE8_HEAD_P (c))
 693         break;
 694       p += len;
 695     }
 696   to = str + (p - str);
 697   while (p < endp)
 698     {
 699       c = *p;
 700       len = BYTES_BY_CHAR_HEAD (c);
 701       if (CHAR_BYTE8_HEAD_P (c))
 702         {
 703           c = STRING_CHAR_ADVANCE (p);
 704           *to++ = CHAR_TO_BYTE8 (c);
 705         }
 706       else
 707         {
 708           while (len--) *to++ = *p++;
 709         }
 710     }
 711   return (to - str);
 712 }
 713
 714 /* Convert eight-bit chars in SRC (in multibyte form) to the
 715    corresponding byte and store in DST.  CHARS is the number of
 716    characters in SRC.  The value is the number of bytes stored in DST.
 717    Usually, the value is the same as CHARS, but is less than it if SRC
 718    contains a non-ASCII, non-eight-bit character.  */
 719
 720 ptrdiff_t
 721 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
 722 {
 723   ptrdiff_t i;
 724
 725   for (i = 0; i < chars; i++)
 726     {
 727       int c = STRING_CHAR_ADVANCE (src);
 728
 729       if (CHAR_BYTE8_P (c))
 730         c = CHAR_TO_BYTE8 (c);
 731       else if (! ASCII_CHAR_P (c))
 732         return i;
 733       *dst++ = c;
 734     }
 735   return i;
 736 }
 737
 738
 739 static ptrdiff_t
 740 string_count_byte8 (Lisp_Object string)
 741 {
 742   bool multibyte = STRING_MULTIBYTE (string);
 743   ptrdiff_t nbytes = SBYTES (string);
 744   unsigned char *p = SDATA (string);
 745   unsigned char *pend = p + nbytes;
 746   ptrdiff_t count = 0;
 747   int c, len;
 748
 749   if (multibyte)
 750     while (p < pend)
 751       {
 752         c = *p;
 753         len = BYTES_BY_CHAR_HEAD (c);
 754
 755         if (CHAR_BYTE8_HEAD_P (c))
 756           count++;
 757         p += len;
 758       }
 759   else
 760     while (p < pend)
 761       {
 762         if (*p++ >= 0x80)
 763           count++;
 764       }
 765   return count;
 766 }
 767
 768
 769 Lisp_Object
 770 string_escape_byte8 (Lisp_Object string)
 771 {
 772   ptrdiff_t nchars = SCHARS (string);
 773   ptrdiff_t nbytes = SBYTES (string);
 774   bool multibyte = STRING_MULTIBYTE (string);
 775   ptrdiff_t byte8_count;
 776   ptrdiff_t thrice_byte8_count, uninit_nchars, uninit_nbytes;
 777   const unsigned char *src, *src_end;
 778   unsigned char *dst;
 779   Lisp_Object val;
 780   int c, len;
 781
 782   if (multibyte && nchars == nbytes)
 783     return string;
 784
 785   byte8_count = string_count_byte8 (string);
 786
 787   if (byte8_count == 0)
 788     return string;
 789
 790   if (INT_MULTIPLY_WRAPV (byte8_count, 3, &thrice_byte8_count))
 791     string_overflow ();
 792
 793   if (multibyte)
 794     {
 795       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 796       if (INT_ADD_WRAPV (nchars, thrice_byte8_count, &uninit_nchars)
 797           || INT_ADD_WRAPV (nbytes, 2 * byte8_count, &uninit_nbytes))
 798         string_overflow ();
 799       val = make_uninit_multibyte_string (uninit_nchars, uninit_nbytes);
 800     }
 801   else
 802     {
 803       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 804       if (INT_ADD_WRAPV (thrice_byte8_count, nbytes, &uninit_nbytes))
 805         string_overflow ();
 806       val = make_uninit_string (uninit_nbytes);
 807     }
 808
 809   src = SDATA (string);
 810   src_end = src + nbytes;
 811   dst = SDATA (val);
 812   if (multibyte)
 813     while (src < src_end)
 814       {
 815         c = *src;
 816         len = BYTES_BY_CHAR_HEAD (c);
 817
 818         if (CHAR_BYTE8_HEAD_P (c))
 819           {
 820             c = STRING_CHAR_ADVANCE (src);
 821             c = CHAR_TO_BYTE8 (c);
 822             dst += sprintf ((char *) dst, "\\%03o", c + 0u);
 823           }
 824         else
 825           while (len--) *dst++ = *src++;
 826       }
 827   else
 828     while (src < src_end)
 829       {
 830         c = *src++;
 831         if (c >= 0x80)
 832           dst += sprintf ((char *) dst, "\\%03o", c + 0u);
 833         else
 834           *dst++ = c;
 835       }
 836   return val;
 837 }
 838
 839 \f
 840 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 841        doc: /*
 842 Concatenate all the argument characters and make the result a string.
 843 usage: (string &rest CHARACTERS)  */)
 844   (ptrdiff_t n, Lisp_Object *args)
 845 {
 846   ptrdiff_t i;
 847   int c;
 848   unsigned char *buf, *p;
 849   Lisp_Object str;
 850   USE_SAFE_ALLOCA;
 851
 852   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 853   p = buf;
 854
 855   for (i = 0; i < n; i++)
 856     {
 857       CHECK_CHARACTER (args[i]);
 858       c = XINT (args[i]);
 859       p += CHAR_STRING (c, p);
 860     }
 861
 862   str = make_string_from_bytes ((char *) buf, n, p - buf);
 863   SAFE_FREE ();
 864   return str;
 865 }
 866
 867 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 868        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 869 usage: (unibyte-string &rest BYTES)  */)
 870   (ptrdiff_t n, Lisp_Object *args)
 871 {
 872   ptrdiff_t i;
 873   Lisp_Object str;
 874   USE_SAFE_ALLOCA;
 875   unsigned char *buf = SAFE_ALLOCA (n);
 876   unsigned char *p = buf;
 877
 878   for (i = 0; i < n; i++)
 879     {
 880       CHECK_RANGED_INTEGER (args[i], 0, 255);
 881       *p++ = XINT (args[i]);
 882     }
 883
 884   str = make_string_from_bytes ((char *) buf, n, p - buf);
 885   SAFE_FREE ();
 886   return str;
 887 }
 888
 889 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 890        Schar_resolve_modifiers, 1, 1, 0,
 891        doc: /* Resolve modifiers in the character CHAR.
 892 The value is a character with modifiers resolved into the character
 893 code.  Unresolved modifiers are kept in the value.
 894 usage: (char-resolve-modifiers CHAR)  */)
 895   (Lisp_Object character)
 896 {
 897   EMACS_INT c;
 898
 899   CHECK_NUMBER (character);
 900   c = XINT (character);
 901   return make_number (char_resolve_modifier_mask (c));
 902 }
 903
 904 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 905        doc: /* Return a byte value of a character at point.
 906 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 907 a byte value.
 908 Optional 2nd arg STRING, if non-nil, is a string of which first
 909 character is a target to get a byte value.  In this case, POSITION, if
 910 non-nil, is an index of a target character in the string.
 911
 912 If the current buffer (or STRING) is multibyte, and the target
 913 character is not ASCII nor 8-bit character, an error is signaled.  */)
 914   (Lisp_Object position, Lisp_Object string)
 915 {
 916   int c;
 917   ptrdiff_t pos;
 918   unsigned char *p;
 919
 920   if (NILP (string))
 921     {
 922       if (NILP (position))
 923         {
 924           p = PT_ADDR;
 925         }
 926       else
 927         {
 928           CHECK_NUMBER_COERCE_MARKER (position);
 929           if (XINT (position) < BEGV || XINT (position) >= ZV)
 930             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 931           pos = XFASTINT (position);
 932           p = CHAR_POS_ADDR (pos);
 933         }
 934       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 935         return make_number (*p);
 936     }
 937   else
 938     {
 939       CHECK_STRING (string);
 940       if (NILP (position))
 941         {
 942           p = SDATA (string);
 943         }
 944       else
 945         {
 946           CHECK_NATNUM (position);
 947           if (XINT (position) >= SCHARS (string))
 948             args_out_of_range (string, position);
 949           pos = XFASTINT (position);
 950           p = SDATA (string) + string_char_to_byte (string, pos);
 951         }
 952       if (! STRING_MULTIBYTE (string))
 953         return make_number (*p);
 954     }
 955   c = STRING_CHAR (p);
 956   if (CHAR_BYTE8_P (c))
 957     c = CHAR_TO_BYTE8 (c);
 958   else if (! ASCII_CHAR_P (c))
 959     error ("Not an ASCII nor an 8-bit character: %d", c);
 960   return make_number (c);
 961 }
 962
 963 /* Return true if C is an alphabetic character.  */
 964 bool
 965 alphabeticp (int c)
 966 {
 967   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
 968   if (! INTEGERP (category))
 969     return false;
 970   EMACS_INT gen_cat = XINT (category);
 971
 972   /* See UTS #18.  There are additional characters that should be
 973      here, those designated as Other_uppercase, Other_lowercase,
 974      and Other_alphabetic; FIXME.  */
 975   return (gen_cat == UNICODE_CATEGORY_Lu
 976           || gen_cat == UNICODE_CATEGORY_Ll
 977           || gen_cat == UNICODE_CATEGORY_Lt
 978           || gen_cat == UNICODE_CATEGORY_Lm
 979           || gen_cat == UNICODE_CATEGORY_Lo
 980           || gen_cat == UNICODE_CATEGORY_Mn
 981           || gen_cat == UNICODE_CATEGORY_Mc
 982           || gen_cat == UNICODE_CATEGORY_Me
 983           || gen_cat == UNICODE_CATEGORY_Nl);
 984 }
 985
 986 /* Return true if C is an alphabetic or decimal-number character.  */
 987 bool
 988 alphanumericp (int c)
 989 {
 990   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
 991   if (! INTEGERP (category))
 992     return false;
 993   EMACS_INT gen_cat = XINT (category);
 994
 995   /* See UTS #18.  Same comment as for alphabeticp applies.  FIXME. */
 996   return (gen_cat == UNICODE_CATEGORY_Lu
 997           || gen_cat == UNICODE_CATEGORY_Ll
 998           || gen_cat == UNICODE_CATEGORY_Lt
 999           || gen_cat == UNICODE_CATEGORY_Lm
1000           || gen_cat == UNICODE_CATEGORY_Lo
1001           || gen_cat == UNICODE_CATEGORY_Mn
1002           || gen_cat == UNICODE_CATEGORY_Mc
1003           || gen_cat == UNICODE_CATEGORY_Me
1004           || gen_cat == UNICODE_CATEGORY_Nl
1005           || gen_cat == UNICODE_CATEGORY_Nd);
1006 }
1007
1008 /* Return true if C is a graphic character.  */
1009 bool
1010 graphicp (int c)
1011 {
1012   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1013   if (! INTEGERP (category))
1014     return false;
1015   EMACS_INT gen_cat = XINT (category);
1016
1017   /* See UTS #18.  */
1018   return (!(gen_cat == UNICODE_CATEGORY_Zs /* space separator */
1019             || gen_cat == UNICODE_CATEGORY_Zl /* line separator */
1020             || gen_cat == UNICODE_CATEGORY_Zp /* paragraph separator */
1021             || gen_cat == UNICODE_CATEGORY_Cc /* control */
1022             || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
1023             || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
1024 }
1025
1026 /* Return true if C is a printable character.  */
1027 bool
1028 printablep (int c)
1029 {
1030   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1031   if (! INTEGERP (category))
1032     return false;
1033   EMACS_INT gen_cat = XINT (category);
1034
1035   /* See UTS #18.  */
1036   return (!(gen_cat == UNICODE_CATEGORY_Cc /* control */
1037             || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
1038             || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
1039 }
1040
1041 /* Return true if C is a horizontal whitespace character, as defined
1042    by http://www.unicode.org/reports/tr18/tr18-19.html#blank.  */
1043 bool
1044 blankp (int c)
1045 {
1046   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1047   if (! INTEGERP (category))
1048     return false;
1049
1050   return XINT (category) == UNICODE_CATEGORY_Zs; /* separator, space */
1051 }
1052
1053
/* Return true if CH is one of the quotation-mark look-alikes listed
   below.  Such characters would read as symbol characters, but
   graphically may be confused with some kind of punctuation, so an
   escaping backslash is required when one of them begins a
   symbol.  */
bool
confusable_symbol_character_p (int ch)
{
  static int const lookalikes[] =
    {
      0x2018, /* LEFT SINGLE QUOTATION MARK */
      0x2019, /* RIGHT SINGLE QUOTATION MARK */
      0x201B, /* SINGLE HIGH-REVERSED-9 QUOTATION MARK */
      0x201C, /* LEFT DOUBLE QUOTATION MARK */
      0x201D, /* RIGHT DOUBLE QUOTATION MARK */
      0x201F, /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK */
      0x301E, /* DOUBLE PRIME QUOTATION MARK */
      0xFF02, /* FULLWIDTH QUOTATION MARK */
      0xFF07  /* FULLWIDTH APOSTROPHE */
    };
  int const n_lookalikes = sizeof lookalikes / sizeof lookalikes[0];

  for (int i = 0; i < n_lookalikes; i++)
    if (lookalikes[i] == ch)
      return true;

  return false;
}
1078
1079 signed char HEXDIGIT_CONST hexdigit[UCHAR_MAX + 1] =
1080   {
1081 #if HEXDIGIT_IS_CONST
1082     [0 ... UCHAR_MAX] = -1,
1083 #endif
1084     ['0'] = 0, ['1'] = 1, ['2'] = 2, ['3'] = 3, ['4'] = 4,
1085     ['5'] = 5, ['6'] = 6, ['7'] = 7, ['8'] = 8, ['9'] = 9,
1086     ['A'] = 10, ['B'] = 11, ['C'] = 12, ['D'] = 13, ['E'] = 14, ['F'] = 15,
1087     ['a'] = 10, ['b'] = 11, ['c'] = 12, ['d'] = 13, ['e'] = 14, ['f'] = 15
1088   };
1089
/* Set up the Lisp-visible state owned by this file: finish
   initializing hexdigit when it could not be fully initialized
   statically, declare the symbols, primitives, and variables defined
   here, and give those variables their initial values.  */
void
syms_of_character (void)
{
#if !HEXDIGIT_IS_CONST
  /* Set the non-hex digit values to -1.  Entries that the static
     initializer did not mention are 0 at this point.  The only
     legitimate 0 belongs to '0', so it is excluded; every other zero
     entry has 1 subtracted from it.  */
  for (int i = 0; i <= UCHAR_MAX; i++)
    hexdigit[i] -= i != '0' && !hexdigit[i];
#endif

  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table has no DEFVAR_LISP of its own.
     NOTE(review): staticpro is presumably what protects it from the
     garbage collector -- confirm against alloc.c.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register the primitives defined in this file.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with a 16-element vector of nil.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only space and newline are marked with t.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* The table is created with 1 as its initial value; the ranges
     0x80..0x9F and MAX_5_BYTE_CHAR + 1..MAX_CHAR are then set to 4.  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("printable-chars", Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* The table is created with nil as its initial value; the ranges
     32..126 and 160..MAX_5_BYTE_CHAR are then set to t.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* The char-table-extra-slots property is put on the symbol before
     the table is created with that symbol as its purpose; this is how
     the one extra slot mentioned in the doc string is requested.  */
  DEFSYM (Qchar_script_table, "char-script-table");
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS).
SCRIPT is a symbol representing a script or a subgroup of a script.
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is set up in characters.el.  Until then
     the variable is nil, so the category predicates in this file
     (alphabeticp, graphicp, etc.) depend on that later setup.  */
  Vunicode_category_table = Qnil;
}