src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2015 Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
/* First, read the documentation in `character.h' to understand the
   code in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <intprops.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "composite.h"
  42 #include "disptab.h"
  43
  44 #else  /* not emacs */
  45
  46 #include "mulelib.h"
  47
  48 #endif /* emacs */
  49
  50 /* Char-table of information about which character to unify to which
  51    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  52 Lisp_Object Vchar_unify_table;
  53
  54 \f
  55
  56 /* If character code C has modifier masks, reflect them to the
  57    character code if possible.  Return the resulting code.  */
  58
  59 EMACS_INT
  60 char_resolve_modifier_mask (EMACS_INT c)
  61 {
  62   /* A non-ASCII character can't reflect modifier bits to the code.  */
  63   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  64     return c;
  65
  66   /* For Meta, Shift, and Control modifiers, we need special care.  */
  67   if (c & CHAR_SHIFT)
  68     {
  69       /* Shift modifier is valid only with [A-Za-z].  */
  70       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  71         c &= ~CHAR_SHIFT;
  72       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  73         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  74       /* Shift modifier for control characters and SPC is ignored.  */
  75       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  76         c &= ~CHAR_SHIFT;
  77     }
  78   if (c & CHAR_CTL)
  79     {
  80       /* Simulate the code in lread.c.  */
  81       /* Allow `\C- ' and `\C-?'.  */
  82       if ((c & 0377) == ' ')
  83         c &= ~0177 & ~ CHAR_CTL;
  84       else if ((c & 0377) == '?')
  85         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  86       /* ASCII control chars are made from letters (both cases),
  87          as well as the non-letters within 0100...0137.  */
  88       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  89         c &= (037 | (~0177 & ~CHAR_CTL));
  90       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
  91         c &= (037 | (~0177 & ~CHAR_CTL));
  92     }
  93 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
  94   if (c & CHAR_META)
  95     {
  96       /* Move the meta bit to the right place for a string.  */
  97       c = (c & ~CHAR_META) | 0x80;
  98     }
  99 #endif
 100
 101   return c;
 102 }
 103
 104
 105 /* Store multibyte form of character C at P.  If C has modifier bits,
 106    handle them appropriately.  */
 107
 108 int
 109 char_string (unsigned int c, unsigned char *p)
 110 {
 111   int bytes;
 112
 113   if (c & CHAR_MODIFIER_MASK)
 114     {
 115       c = char_resolve_modifier_mask (c);
 116       /* If C still has any modifier bits, just ignore it.  */
 117       c &= ~CHAR_MODIFIER_MASK;
 118     }
 119
 120   if (c <= MAX_3_BYTE_CHAR)
 121     {
 122       bytes = CHAR_STRING (c, p);
 123     }
 124   else if (c <= MAX_4_BYTE_CHAR)
 125     {
 126       p[0] = (0xF0 | (c >> 18));
 127       p[1] = (0x80 | ((c >> 12) & 0x3F));
 128       p[2] = (0x80 | ((c >> 6) & 0x3F));
 129       p[3] = (0x80 | (c & 0x3F));
 130       bytes = 4;
 131     }
 132   else if (c <= MAX_5_BYTE_CHAR)
 133     {
 134       p[0] = 0xF8;
 135       p[1] = (0x80 | ((c >> 18) & 0x0F));
 136       p[2] = (0x80 | ((c >> 12) & 0x3F));
 137       p[3] = (0x80 | ((c >> 6) & 0x3F));
 138       p[4] = (0x80 | (c & 0x3F));
 139       bytes = 5;
 140     }
 141   else if (c <= MAX_CHAR)
 142     {
 143       c = CHAR_TO_BYTE8 (c);
 144       bytes = BYTE8_STRING (c, p);
 145     }
 146   else
 147     error ("Invalid character: %x", c);
 148
 149   return bytes;
 150 }
 151
 152
 153 /* Return a character whose multibyte form is at P.  If LEN is not
 154    NULL, it must be a pointer to integer.  In that case, set *LEN to
 155    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 156    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 157    the ending address (i.e., the starting address of the next
 158    character) of the multibyte form.  */
 159
 160 int
 161 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 162 {
 163   int c;
 164   const unsigned char *saved_p = p;
 165
 166   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 167     {
 168       /* 1-, 2-, and 3-byte sequences can be handled by the macro.  */
 169       c = STRING_CHAR_ADVANCE (p);
 170     }
 171   else if (! (*p & 0x08))
 172     {
 173       /* A 4-byte sequence of this form:
 174          11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  */
 175       c = ((((p)[0] & 0x7) << 18)
 176            | (((p)[1] & 0x3F) << 12)
 177            | (((p)[2] & 0x3F) << 6)
 178            | ((p)[3] & 0x3F));
 179       p += 4;
 180     }
 181   else
 182     {
 183       /* A 5-byte sequence of this form:
 184
 185          111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 186
 187          Note that the top 4 `x's are always 0, so shifting p[1] can
 188          never exceed the maximum valid character codepoint. */
 189       c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
 190            (((p)[1] & 0x3F) << 18)
 191            | (((p)[2] & 0x3F) << 12)
 192            | (((p)[3] & 0x3F) << 6)
 193            | ((p)[4] & 0x3F));
 194       p += 5;
 195     }
 196
 197   if (len)
 198     *len = p - saved_p;
 199   if (advanced)
 200     *advanced = p;
 201   return c;
 202 }
 203
 204
 205 /* Translate character C by translation table TABLE.  If no translation is
 206    found in TABLE, return the untranslated character.  If TABLE is a list,
 207    elements are char tables.  In that case, recursively translate C by all the
 208    tables in the list.  */
 209
 210 int
 211 translate_char (Lisp_Object table, int c)
 212 {
 213   if (CHAR_TABLE_P (table))
 214     {
 215       Lisp_Object ch;
 216
 217       ch = CHAR_TABLE_REF (table, c);
 218       if (CHARACTERP (ch))
 219         c = XINT (ch);
 220     }
 221   else
 222     {
 223       for (; CONSP (table); table = XCDR (table))
 224         c = translate_char (XCAR (table), c);
 225     }
 226   return c;
 227 }
 228
 229 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 230        doc: /* Return non-nil if OBJECT is a character.
 231 In Emacs Lisp, characters are represented by character codes, which
 232 are non-negative integers.  The function `max-char' returns the
 233 maximum character code.
 234 usage: (characterp OBJECT)  */
 235        attributes: const)
 236   (Lisp_Object object, Lisp_Object ignore)
 237 {
 238   return (CHARACTERP (object) ? Qt : Qnil);
 239 }
 240
 241 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 242        doc: /* Return the character of the maximum code.  */
 243        attributes: const)
 244   (void)
 245 {
 246   return make_number (MAX_CHAR);
 247 }
 248
 249 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 250        Sunibyte_char_to_multibyte, 1, 1, 0,
 251        doc: /* Convert the byte CH to multibyte character.  */)
 252   (Lisp_Object ch)
 253 {
 254   int c;
 255
 256   CHECK_CHARACTER (ch);
 257   c = XFASTINT (ch);
 258   if (c >= 0x100)
 259     error ("Not a unibyte character: %d", c);
 260   MAKE_CHAR_MULTIBYTE (c);
 261   return make_number (c);
 262 }
 263
 264 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 265        Smultibyte_char_to_unibyte, 1, 1, 0,
 266        doc: /* Convert the multibyte character CH to a byte.
 267 If the multibyte character does not represent a byte, return -1.  */)
 268   (Lisp_Object ch)
 269 {
 270   int cm;
 271
 272   CHECK_CHARACTER (ch);
 273   cm = XFASTINT (ch);
 274   if (cm < 256)
 275     /* Can't distinguish a byte read from a unibyte buffer from
 276        a latin1 char, so let's let it slide.  */
 277     return ch;
 278   else
 279     {
 280       int cu = CHAR_TO_BYTE_SAFE (cm);
 281       return make_number (cu);
 282     }
 283 }
 284
 285
 286 /* Return width (columns) of C considering the buffer display table DP. */
 287
 288 static ptrdiff_t
 289 char_width (int c, struct Lisp_Char_Table *dp)
 290 {
 291   ptrdiff_t width = CHAR_WIDTH (c);
 292
 293   if (dp)
 294     {
 295       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 296       int i;
 297
 298       if (VECTORP (disp))
 299         for (i = 0, width = 0; i < ASIZE (disp); i++)
 300           {
 301             ch = AREF (disp, i);
 302             if (CHARACTERP (ch))
 303               {
 304                 int w = CHAR_WIDTH (XFASTINT (ch));
 305                 if (INT_ADD_OVERFLOW (width, w))
 306                   string_overflow ();
 307                 width += w;
 308               }
 309           }
 310     }
 311   return width;
 312 }
 313
 314
 315 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 316        doc: /* Return width of CHAR when displayed in the current buffer.
 317 The width is measured by how many columns it occupies on the screen.
 318 Tab is taken to occupy `tab-width' columns.
 319 usage: (char-width CHAR)  */)
 320   (Lisp_Object ch)
 321 {
 322   int c;
 323   ptrdiff_t width;
 324
 325   CHECK_CHARACTER (ch);
 326   c = XINT (ch);
 327   width = char_width (c, buffer_display_table ());
 328   return make_number (width);
 329 }
 330
 331 /* Return width of string STR of length LEN when displayed in the
 332    current buffer.  The width is measured by how many columns it
 333    occupies on the screen.  If PRECISION > 0, return the width of
 334    longest substring that doesn't exceed PRECISION, and set number of
 335    characters and bytes of the substring in *NCHARS and *NBYTES
 336    respectively.  */
 337
 338 ptrdiff_t
 339 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 340                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 341 {
 342   ptrdiff_t i = 0, i_byte = 0;
 343   ptrdiff_t width = 0;
 344   struct Lisp_Char_Table *dp = buffer_display_table ();
 345
 346   while (i_byte < len)
 347     {
 348       int bytes;
 349       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 350       ptrdiff_t thiswidth = char_width (c, dp);
 351
 352       if (precision <= 0)
 353         {
 354           if (INT_ADD_OVERFLOW (width, thiswidth))
 355             string_overflow ();
 356         }
 357       else if (precision - width < thiswidth)
 358         {
 359           *nchars = i;
 360           *nbytes = i_byte;
 361           return width;
 362         }
 363       i++;
 364       i_byte += bytes;
 365       width += thiswidth;
 366   }
 367
 368   if (precision > 0)
 369     {
 370       *nchars = i;
 371       *nbytes = i_byte;
 372     }
 373
 374   return width;
 375 }
 376
 377 /* Return width of string STR of length LEN when displayed in the
 378    current buffer.  The width is measured by how many columns it
 379    occupies on the screen.  */
 380
 381 ptrdiff_t
 382 strwidth (const char *str, ptrdiff_t len)
 383 {
 384   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 385 }
 386
 387 /* Return width of Lisp string STRING when displayed in the current
 388    buffer.  The width is measured by how many columns it occupies on
 389    the screen while paying attention to compositions.  If PRECISION >
 390    0, return the width of longest substring that doesn't exceed
 391    PRECISION, and set number of characters and bytes of the substring
 392    in *NCHARS and *NBYTES respectively.  */
 393
 394 ptrdiff_t
 395 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 396                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 397 {
 398   ptrdiff_t len = SCHARS (string);
 399   /* This set multibyte to 0 even if STRING is multibyte when it
 400      contains only ascii and eight-bit-graphic, but that's
 401      intentional.  */
 402   bool multibyte = len < SBYTES (string);
 403   unsigned char *str = SDATA (string);
 404   ptrdiff_t i = 0, i_byte = 0;
 405   ptrdiff_t width = 0;
 406   struct Lisp_Char_Table *dp = buffer_display_table ();
 407
 408   while (i < len)
 409     {
 410       ptrdiff_t chars, bytes, thiswidth;
 411       Lisp_Object val;
 412       ptrdiff_t cmp_id;
 413       ptrdiff_t ignore, end;
 414
 415       if (find_composition (i, -1, &ignore, &end, &val, string)
 416           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 417               >= 0))
 418         {
 419           thiswidth = composition_table[cmp_id]->width;
 420           chars = end - i;
 421           bytes = string_char_to_byte (string, end) - i_byte;
 422         }
 423       else
 424         {
 425           int c;
 426
 427           if (multibyte)
 428             {
 429               int cbytes;
 430               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 431               bytes = cbytes;
 432             }
 433           else
 434             c = str[i_byte], bytes = 1;
 435           chars = 1;
 436           thiswidth = char_width (c, dp);
 437         }
 438
 439       if (precision <= 0)
 440         {
 441 #ifdef emacs
 442           if (INT_ADD_OVERFLOW (width, thiswidth))
 443             string_overflow ();
 444 #endif
 445         }
 446       else if (precision - width < thiswidth)
 447         {
 448           *nchars = i;
 449           *nbytes = i_byte;
 450           return width;
 451         }
 452       i += chars;
 453       i_byte += bytes;
 454       width += thiswidth;
 455     }
 456
 457   if (precision > 0)
 458     {
 459       *nchars = i;
 460       *nbytes = i_byte;
 461     }
 462
 463   return width;
 464 }
 465
 466 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 467        doc: /* Return width of STRING when displayed in the current buffer.
 468 Width is measured by how many columns it occupies on the screen.
 469 When calculating width of a multibyte character in STRING,
 470 only the base leading-code is considered; the validity of
 471 the following bytes is not checked.  Tabs in STRING are always
 472 taken to occupy `tab-width' columns.
 473 usage: (string-width STRING)  */)
 474   (Lisp_Object str)
 475 {
 476   Lisp_Object val;
 477
 478   CHECK_STRING (str);
 479   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 480   return val;
 481 }
 482
 483 /* Return the number of characters in the NBYTES bytes at PTR.
 484    This works by looking at the contents and checking for multibyte
 485    sequences while assuming that there's no invalid sequence.
 486    However, if the current buffer has enable-multibyte-characters =
 487    nil, we treat each byte as a character.  */
 488
 489 ptrdiff_t
 490 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 491 {
 492   /* current_buffer is null at early stages of Emacs initialization.  */
 493   if (current_buffer == 0
 494       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 495     return nbytes;
 496
 497   return multibyte_chars_in_text (ptr, nbytes);
 498 }
 499
 500 /* Return the number of characters in the NBYTES bytes at PTR.
 501    This works by looking at the contents and checking for multibyte
 502    sequences while assuming that there's no invalid sequence.  It
 503    ignores enable-multibyte-characters.  */
 504
 505 ptrdiff_t
 506 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 507 {
 508   const unsigned char *endp = ptr + nbytes;
 509   ptrdiff_t chars = 0;
 510
 511   while (ptr < endp)
 512     {
 513       int len = MULTIBYTE_LENGTH (ptr, endp);
 514
 515       if (len == 0)
 516         emacs_abort ();
 517       ptr += len;
 518       chars++;
 519     }
 520
 521   return chars;
 522 }
 523
 524 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 525    characters and bytes in it, and store them in *NCHARS and *NBYTES
 526    respectively.  On counting bytes, pay attention to that 8-bit
 527    characters not constructing a valid multibyte sequence are
 528    represented by 2-byte in a multibyte text.  */
 529
 530 void
 531 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 532                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 533 {
 534   const unsigned char *endp = str + len;
 535   int n;
 536   ptrdiff_t chars = 0, bytes = 0;
 537
 538   if (len >= MAX_MULTIBYTE_LENGTH)
 539     {
 540       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 541       while (str < adjusted_endp)
 542         {
 543           if (! CHAR_BYTE8_HEAD_P (*str)
 544               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 545             str += n, bytes += n;
 546           else
 547             str++, bytes += 2;
 548           chars++;
 549         }
 550     }
 551   while (str < endp)
 552     {
 553       if (! CHAR_BYTE8_HEAD_P (*str)
 554           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 555         str += n, bytes += n;
 556       else
 557         str++, bytes += 2;
 558       chars++;
 559     }
 560
 561   *nchars = chars;
 562   *nbytes = bytes;
 563   return;
 564 }
 565
 566 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 567    It actually converts only such 8-bit characters that don't construct
 568    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 569    NCHARS is nonzero, set *NCHARS to the number of characters in the
 570    text.  It is assured that we can use LEN bytes at STR as a work
 571    area and that is enough.  Return the number of bytes of the
 572    resulting text.  */
 573
 574 ptrdiff_t
 575 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 576                   ptrdiff_t *nchars)
 577 {
 578   unsigned char *p = str, *endp = str + nbytes;
 579   unsigned char *to;
 580   ptrdiff_t chars = 0;
 581   int n;
 582
 583   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 584     {
 585       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 586       while (p < adjusted_endp
 587              && ! CHAR_BYTE8_HEAD_P (*p)
 588              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 589         p += n, chars++;
 590     }
 591   while (p < endp
 592          && ! CHAR_BYTE8_HEAD_P (*p)
 593          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 594     p += n, chars++;
 595   if (nchars)
 596     *nchars = chars;
 597   if (p == endp)
 598     return nbytes;
 599
 600   to = p;
 601   nbytes = endp - p;
 602   endp = str + len;
 603   memmove (endp - nbytes, p, nbytes);
 604   p = endp - nbytes;
 605
 606   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 607     {
 608       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 609       while (p < adjusted_endp)
 610         {
 611           if (! CHAR_BYTE8_HEAD_P (*p)
 612               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 613             {
 614               while (n--)
 615                 *to++ = *p++;
 616             }
 617           else
 618             {
 619               int c = *p++;
 620               c = BYTE8_TO_CHAR (c);
 621               to += CHAR_STRING (c, to);
 622             }
 623         }
 624       chars++;
 625     }
 626   while (p < endp)
 627     {
 628       if (! CHAR_BYTE8_HEAD_P (*p)
 629           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 630         {
 631           while (n--)
 632             *to++ = *p++;
 633         }
 634       else
 635         {
 636           int c = *p++;
 637           c = BYTE8_TO_CHAR (c);
 638           to += CHAR_STRING (c, to);
 639         }
 640       chars++;
 641     }
 642   if (nchars)
 643     *nchars = chars;
 644   return (to - str);
 645 }
 646
 647 /* Parse unibyte string at STR of LEN bytes, and return the number of
 648    bytes it may occupy when converted to multibyte string by
 649    `str_to_multibyte'.  */
 650
 651 ptrdiff_t
 652 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 653 {
 654   const unsigned char *endp = str + len;
 655   ptrdiff_t bytes;
 656
 657   for (bytes = 0; str < endp; str++)
 658     {
 659       int n = *str < 0x80 ? 1 : 2;
 660       if (INT_ADD_OVERFLOW (bytes, n))
 661         string_overflow ();
 662       bytes += n;
 663     }
 664   return bytes;
 665 }
 666
 667
 668 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 669    that contains the same single-byte characters.  It actually
 670    converts all 8-bit characters to multibyte forms.  It is assured
 671    that we can use LEN bytes at STR as a work area and that is
 672    enough.  */
 673
 674 ptrdiff_t
 675 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 676 {
 677   unsigned char *p = str, *endp = str + bytes;
 678   unsigned char *to;
 679
 680   while (p < endp && *p < 0x80) p++;
 681   if (p == endp)
 682     return bytes;
 683   to = p;
 684   bytes = endp - p;
 685   endp = str + len;
 686   memmove (endp - bytes, p, bytes);
 687   p = endp - bytes;
 688   while (p < endp)
 689     {
 690       int c = *p++;
 691
 692       if (c >= 0x80)
 693         c = BYTE8_TO_CHAR (c);
 694       to += CHAR_STRING (c, to);
 695     }
 696   return (to - str);
 697 }
 698
 699 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 700    actually converts characters in the range 0x80..0xFF to
 701    unibyte.  */
 702
 703 ptrdiff_t
 704 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 705 {
 706   const unsigned char *p = str, *endp = str + bytes;
 707   unsigned char *to;
 708   int c, len;
 709
 710   while (p < endp)
 711     {
 712       c = *p;
 713       len = BYTES_BY_CHAR_HEAD (c);
 714       if (CHAR_BYTE8_HEAD_P (c))
 715         break;
 716       p += len;
 717     }
 718   to = str + (p - str);
 719   while (p < endp)
 720     {
 721       c = *p;
 722       len = BYTES_BY_CHAR_HEAD (c);
 723       if (CHAR_BYTE8_HEAD_P (c))
 724         {
 725           c = STRING_CHAR_ADVANCE (p);
 726           *to++ = CHAR_TO_BYTE8 (c);
 727         }
 728       else
 729         {
 730           while (len--) *to++ = *p++;
 731         }
 732     }
 733   return (to - str);
 734 }
 735
 736 /* Convert eight-bit chars in SRC (in multibyte form) to the
 737    corresponding byte and store in DST.  CHARS is the number of
 738    characters in SRC.  The value is the number of bytes stored in DST.
 739    Usually, the value is the same as CHARS, but is less than it if SRC
 740    contains a non-ASCII, non-eight-bit character.  */
 741
 742 ptrdiff_t
 743 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
 744 {
 745   ptrdiff_t i;
 746
 747   for (i = 0; i < chars; i++)
 748     {
 749       int c = STRING_CHAR_ADVANCE (src);
 750
 751       if (CHAR_BYTE8_P (c))
 752         c = CHAR_TO_BYTE8 (c);
 753       else if (! ASCII_CHAR_P (c))
 754         return i;
 755       *dst++ = c;
 756     }
 757   return i;
 758 }
 759
 760
 761 static ptrdiff_t
 762 string_count_byte8 (Lisp_Object string)
 763 {
 764   bool multibyte = STRING_MULTIBYTE (string);
 765   ptrdiff_t nbytes = SBYTES (string);
 766   unsigned char *p = SDATA (string);
 767   unsigned char *pend = p + nbytes;
 768   ptrdiff_t count = 0;
 769   int c, len;
 770
 771   if (multibyte)
 772     while (p < pend)
 773       {
 774         c = *p;
 775         len = BYTES_BY_CHAR_HEAD (c);
 776
 777         if (CHAR_BYTE8_HEAD_P (c))
 778           count++;
 779         p += len;
 780       }
 781   else
 782     while (p < pend)
 783       {
 784         if (*p++ >= 0x80)
 785           count++;
 786       }
 787   return count;
 788 }
 789
 790
 791 Lisp_Object
 792 string_escape_byte8 (Lisp_Object string)
 793 {
 794   ptrdiff_t nchars = SCHARS (string);
 795   ptrdiff_t nbytes = SBYTES (string);
 796   bool multibyte = STRING_MULTIBYTE (string);
 797   ptrdiff_t byte8_count;
 798   const unsigned char *src, *src_end;
 799   unsigned char *dst;
 800   Lisp_Object val;
 801   int c, len;
 802
 803   if (multibyte && nchars == nbytes)
 804     return string;
 805
 806   byte8_count = string_count_byte8 (string);
 807
 808   if (byte8_count == 0)
 809     return string;
 810
 811   if (multibyte)
 812     {
 813       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 814           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 815         string_overflow ();
 816
 817       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 818       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 819                                           nbytes + byte8_count * 2);
 820     }
 821   else
 822     {
 823       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 824         string_overflow ();
 825
 826       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 827       val = make_uninit_string (nbytes + byte8_count * 3);
 828     }
 829
 830   src = SDATA (string);
 831   src_end = src + nbytes;
 832   dst = SDATA (val);
 833   if (multibyte)
 834     while (src < src_end)
 835       {
 836         c = *src;
 837         len = BYTES_BY_CHAR_HEAD (c);
 838
 839         if (CHAR_BYTE8_HEAD_P (c))
 840           {
 841             c = STRING_CHAR_ADVANCE (src);
 842             c = CHAR_TO_BYTE8 (c);
 843             dst += sprintf ((char *) dst, "\\%03o", c + 0u);
 844           }
 845         else
 846           while (len--) *dst++ = *src++;
 847       }
 848   else
 849     while (src < src_end)
 850       {
 851         c = *src++;
 852         if (c >= 0x80)
 853           dst += sprintf ((char *) dst, "\\%03o", c + 0u);
 854         else
 855           *dst++ = c;
 856       }
 857   return val;
 858 }
 859
 860 \f
 861 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 862        doc: /*
 863 Concatenate all the argument characters and make the result a string.
 864 usage: (string &rest CHARACTERS)  */)
 865   (ptrdiff_t n, Lisp_Object *args)
 866 {
 867   ptrdiff_t i;
 868   int c;
 869   unsigned char *buf, *p;
 870   Lisp_Object str;
 871   USE_SAFE_ALLOCA;
 872
 873   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 874   p = buf;
 875
 876   for (i = 0; i < n; i++)
 877     {
 878       CHECK_CHARACTER (args[i]);
 879       c = XINT (args[i]);
 880       p += CHAR_STRING (c, p);
 881     }
 882
 883   str = make_string_from_bytes ((char *) buf, n, p - buf);
 884   SAFE_FREE ();
 885   return str;
 886 }
 887
 888 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 889        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 890 usage: (unibyte-string &rest BYTES)  */)
 891   (ptrdiff_t n, Lisp_Object *args)
 892 {
 893   ptrdiff_t i;
 894   Lisp_Object str;
 895   USE_SAFE_ALLOCA;
 896   unsigned char *buf = SAFE_ALLOCA (n);
 897   unsigned char *p = buf;
 898
 899   for (i = 0; i < n; i++)
 900     {
 901       CHECK_RANGED_INTEGER (args[i], 0, 255);
 902       *p++ = XINT (args[i]);
 903     }
 904
 905   str = make_string_from_bytes ((char *) buf, n, p - buf);
 906   SAFE_FREE ();
 907   return str;
 908 }
 909
 910 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 911        Schar_resolve_modifiers, 1, 1, 0,
 912        doc: /* Resolve modifiers in the character CHAR.
 913 The value is a character with modifiers resolved into the character
 914 code.  Unresolved modifiers are kept in the value.
 915 usage: (char-resolve-modifiers CHAR)  */)
 916   (Lisp_Object character)
 917 {
 918   EMACS_INT c;
 919
 920   CHECK_NUMBER (character);
 921   c = XINT (character);
 922   return make_number (char_resolve_modifier_mask (c));
 923 }
 924
 925 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 926        doc: /* Return a byte value of a character at point.
 927 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 928 a byte value.
 929 Optional 2nd arg STRING, if non-nil, is a string of which first
 930 character is a target to get a byte value.  In this case, POSITION, if
 931 non-nil, is an index of a target character in the string.
 932
 933 If the current buffer (or STRING) is multibyte, and the target
 934 character is not ASCII nor 8-bit character, an error is signaled.  */)
 935   (Lisp_Object position, Lisp_Object string)
 936 {
 937   int c;
 938   ptrdiff_t pos;
 939   unsigned char *p;
 940
 941   if (NILP (string))
 942     {
 943       if (NILP (position))
 944         {
 945           p = PT_ADDR;
 946         }
 947       else
 948         {
 949           CHECK_NUMBER_COERCE_MARKER (position);
 950           if (XINT (position) < BEGV || XINT (position) >= ZV)
 951             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 952           pos = XFASTINT (position);
 953           p = CHAR_POS_ADDR (pos);
 954         }
 955       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 956         return make_number (*p);
 957     }
 958   else
 959     {
 960       CHECK_STRING (string);
 961       if (NILP (position))
 962         {
 963           p = SDATA (string);
 964         }
 965       else
 966         {
 967           CHECK_NATNUM (position);
 968           if (XINT (position) >= SCHARS (string))
 969             args_out_of_range (string, position);
 970           pos = XFASTINT (position);
 971           p = SDATA (string) + string_char_to_byte (string, pos);
 972         }
 973       if (! STRING_MULTIBYTE (string))
 974         return make_number (*p);
 975     }
 976   c = STRING_CHAR (p);
 977   if (CHAR_BYTE8_P (c))
 978     c = CHAR_TO_BYTE8 (c);
 979   else if (! ASCII_CHAR_P (c))
 980     error ("Not an ASCII nor an 8-bit character: %d", c);
 981   return make_number (c);
 982 }
 983
 984 #ifdef emacs
 985
 986 /* Return true if C is an alphabetic character.  */
 987 bool
 988 alphabeticp (int c)
 989 {
 990   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
 991   if (! INTEGERP (category))
 992     return false;
 993   EMACS_INT gen_cat = XINT (category);
 994
 995   /* See UTS #18.  There are additional characters that should be
 996      here, those designated as Other_uppercase, Other_lowercase,
 997      and Other_alphabetic; FIXME.  */
 998   return (gen_cat == UNICODE_CATEGORY_Lu
 999           || gen_cat == UNICODE_CATEGORY_Ll
1000           || gen_cat == UNICODE_CATEGORY_Lt
1001           || gen_cat == UNICODE_CATEGORY_Lm
1002           || gen_cat == UNICODE_CATEGORY_Lo
1003           || gen_cat == UNICODE_CATEGORY_Mn
1004           || gen_cat == UNICODE_CATEGORY_Mc
1005           || gen_cat == UNICODE_CATEGORY_Me
1006           || gen_cat == UNICODE_CATEGORY_Nl);
1007 }
1008
1009 /* Return true if C is a decimal-number character.  */
1010 bool
1011 decimalnump (int c)
1012 {
1013   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1014   if (! INTEGERP (category))
1015     return false;
1016   EMACS_INT gen_cat = XINT (category);
1017
1018   /* See UTS #18.  */
1019   return gen_cat == UNICODE_CATEGORY_Nd;
1020 }
1021
1022 /* Return true if C is a graphic character.  */
1023 bool
1024 graphicp (int c)
1025 {
1026   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1027   if (! INTEGERP (category))
1028     return false;
1029   EMACS_INT gen_cat = XINT (category);
1030
1031   /* See UTS #18.  */
1032   return (!(gen_cat == UNICODE_CATEGORY_Zs /* space separator */
1033             || gen_cat == UNICODE_CATEGORY_Zl /* line separator */
1034             || gen_cat == UNICODE_CATEGORY_Zp /* paragraph separator */
1035             || gen_cat == UNICODE_CATEGORY_Cc /* control */
1036             || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
1037             || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
1038 }
1039
1040 /* Return true if C is a printable character.  */
1041 bool
1042 printablep (int c)
1043 {
1044   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1045   if (! INTEGERP (category))
1046     return false;
1047   EMACS_INT gen_cat = XINT (category);
1048
1049   /* See UTS #18.  */
1050   return (!(gen_cat == UNICODE_CATEGORY_Cc /* control */
1051             || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
1052             || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
1053 }
1054
     /* Initialize the Lisp-visible parts of this file: define the
        symbols it uses, register its subrs with defsubr, and declare
        and give initial values to its Lisp variables.  The statements
        run in the order written, and some rely on earlier ones (the
        symbols defined with DEFSYM are used further down), so do not
        reorder them casually.  */
1055 void
1056 syms_of_character (void)
1057 {
1058   DEFSYM (Qcharacterp, "characterp");
1059   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1060
       /* Vchar_unify_table is a plain C variable, not one declared with
          DEFVAR_LISP, so it is registered with staticpro by hand
          (presumably to keep it visible to the garbage collector) and
          starts out as nil.  */
1061   staticpro (&Vchar_unify_table);
1062   Vchar_unify_table = Qnil;
1063
       /* Register the subr object of each primitive.  */
1064   defsubr (&Smax_char);
1065   defsubr (&Scharacterp);
1066   defsubr (&Sunibyte_char_to_multibyte);
1067   defsubr (&Smultibyte_char_to_unibyte);
1068   defsubr (&Schar_width);
1069   defsubr (&Sstring_width);
1070   defsubr (&Sstring);
1071   defsubr (&Sunibyte_string);
1072   defsubr (&Schar_resolve_modifiers);
1073   defsubr (&Sget_byte);
1074
       /* Initial value: a 16-element vector filled with nil.  */
1075   DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
1076                doc: /*
1077 Vector recording all translation tables ever defined.
1078 Each element is a pair (SYMBOL . TABLE) relating the table to the
1079 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1080   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1081
       /* Created with initial value nil; only space and newline are
          set to t here.  */
1082   DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
1083                doc: /*
1084 A char-table for characters which invoke auto-filling.
1085 Such characters have value t in this table.  */);
1086   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1087   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1088   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1089
       /* Created with initial value 1.  The ranges 0x80..0x9F and
          MAX_5_BYTE_CHAR + 1..MAX_CHAR are then given the value 4.
          NOTE(review): presumably because such characters are shown as
          four-column escapes -- confirm.  */
1090   DEFVAR_LISP ("char-width-table", Vchar_width_table,
1091                doc: /*
1092 A char-table for width (columns) of each character.  */);
1093   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1094   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1095   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1096                         make_number (4));
1097
       /* Created with initial value nil; the ranges 32..126 and
          160..MAX_5_BYTE_CHAR are then set to t.  */
1098   DEFVAR_LISP ("printable-chars", Vprintable_chars,
1099                doc: /* A char-table for each printable character.  */);
1100   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1101   Fset_char_table_range (Vprintable_chars,
1102                          Fcons (make_number (32), make_number (126)), Qt);
1103   Fset_char_table_range (Vprintable_chars,
1104                          Fcons (make_number (160),
1105                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1106
1107   DEFVAR_LISP ("char-script-table", Vchar_script_table,
1108                doc: /* Char table of script symbols.
1109 It has one extra slot whose value is a list of script symbols.  */);
1110
       /* The extra-slot count (1) is put on the symbol before the table
          is created with that symbol as its first argument.  Keep this
          order: presumably Fmake_char_table consults the property --
          confirm before moving anything.  */
1111   DEFSYM (Qchar_script_table, "char-script-table");
1112   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1113   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1114
       /* Starts out as nil.  */
1115   DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
1116                doc: /* Alist of scripts vs the representative characters.
1117 Each element is a cons (SCRIPT . CHARS).
1118 SCRIPT is a symbol representing a script or a subgroup of a script.
1119 CHARS is a list or a vector of characters.
1120 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1121 If it is a vector, one of the characters in the vector is necessary.
1122 This variable is used to find a font for a specific script.  */);
1123   Vscript_representative_chars = Qnil;
1124
1125   DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
1126                doc: /* Char table of Unicode's "General Category".
1127 All Unicode characters have one of the following values (symbol):
1128   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1129   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1130 See The Unicode Standard for the meaning of those values.  */);
1131   /* The correct char-table is set up in characters.el; until then
          the variable is nil.  */
1132   Vunicode_category_table = Qnil;
1133 }
1134
1135 #endif /* emacs */