src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2015 Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <intprops.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
/* Char-table of information about which character to unify to which
   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.
   syms_of_character registers this variable with staticpro and
   initializes it to Qnil.  */
Lisp_Object Vchar_unify_table;
  54
  55 \f
  56
  57 /* If character code C has modifier masks, reflect them to the
  58    character code if possible.  Return the resulting code.  */
  59
  60 EMACS_INT
  61 char_resolve_modifier_mask (EMACS_INT c)
  62 {
  63   /* A non-ASCII character can't reflect modifier bits to the code.  */
  64   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  65     return c;
  66
  67   /* For Meta, Shift, and Control modifiers, we need special care.  */
  68   if (c & CHAR_SHIFT)
  69     {
  70       /* Shift modifier is valid only with [A-Za-z].  */
  71       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  72         c &= ~CHAR_SHIFT;
  73       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  74         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  75       /* Shift modifier for control characters and SPC is ignored.  */
  76       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  77         c &= ~CHAR_SHIFT;
  78     }
  79   if (c & CHAR_CTL)
  80     {
  81       /* Simulate the code in lread.c.  */
  82       /* Allow `\C- ' and `\C-?'.  */
  83       if ((c & 0377) == ' ')
  84         c &= ~0177 & ~ CHAR_CTL;
  85       else if ((c & 0377) == '?')
  86         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  87       /* ASCII control chars are made from letters (both cases),
  88          as well as the non-letters within 0100...0137.  */
  89       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  90         c &= (037 | (~0177 & ~CHAR_CTL));
  91       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
  92         c &= (037 | (~0177 & ~CHAR_CTL));
  93     }
  94 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
  95   if (c & CHAR_META)
  96     {
  97       /* Move the meta bit to the right place for a string.  */
  98       c = (c & ~CHAR_META) | 0x80;
  99     }
 100 #endif
 101
 102   return c;
 103 }
 104
 105
 106 /* Store multibyte form of character C at P.  If C has modifier bits,
 107    handle them appropriately.  */
 108
 109 int
 110 char_string (unsigned int c, unsigned char *p)
 111 {
 112   int bytes;
 113
 114   if (c & CHAR_MODIFIER_MASK)
 115     {
 116       c = char_resolve_modifier_mask (c);
 117       /* If C still has any modifier bits, just ignore it.  */
 118       c &= ~CHAR_MODIFIER_MASK;
 119     }
 120
 121   if (c <= MAX_3_BYTE_CHAR)
 122     {
 123       bytes = CHAR_STRING (c, p);
 124     }
 125   else if (c <= MAX_4_BYTE_CHAR)
 126     {
 127       p[0] = (0xF0 | (c >> 18));
 128       p[1] = (0x80 | ((c >> 12) & 0x3F));
 129       p[2] = (0x80 | ((c >> 6) & 0x3F));
 130       p[3] = (0x80 | (c & 0x3F));
 131       bytes = 4;
 132     }
 133   else if (c <= MAX_5_BYTE_CHAR)
 134     {
 135       p[0] = 0xF8;
 136       p[1] = (0x80 | ((c >> 18) & 0x0F));
 137       p[2] = (0x80 | ((c >> 12) & 0x3F));
 138       p[3] = (0x80 | ((c >> 6) & 0x3F));
 139       p[4] = (0x80 | (c & 0x3F));
 140       bytes = 5;
 141     }
 142   else if (c <= MAX_CHAR)
 143     {
 144       c = CHAR_TO_BYTE8 (c);
 145       bytes = BYTE8_STRING (c, p);
 146     }
 147   else
 148     error ("Invalid character: %x", c);
 149
 150   return bytes;
 151 }
 152
 153
 154 /* Return a character whose multibyte form is at P.  If LEN is not
 155    NULL, it must be a pointer to integer.  In that case, set *LEN to
 156    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 157    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 158    the ending address (i.e., the starting address of the next
 159    character) of the multibyte form.  */
 160
 161 int
 162 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 163 {
 164   int c;
 165   const unsigned char *saved_p = p;
 166
 167   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 168     {
 169       /* 1-, 2-, and 3-byte sequences can be handled by the macro.  */
 170       c = STRING_CHAR_ADVANCE (p);
 171     }
 172   else if (! (*p & 0x08))
 173     {
 174       /* A 4-byte sequence of this form:
 175          11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  */
 176       c = ((((p)[0] & 0x7) << 18)
 177            | (((p)[1] & 0x3F) << 12)
 178            | (((p)[2] & 0x3F) << 6)
 179            | ((p)[3] & 0x3F));
 180       p += 4;
 181     }
 182   else
 183     {
 184       /* A 5-byte sequence of this form:
 185
 186          111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 187
 188          Note that the top 4 `x's are always 0, so shifting p[1] can
 189          never exceed the maximum valid character codepoint. */
 190       c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
 191            (((p)[1] & 0x3F) << 18)
 192            | (((p)[2] & 0x3F) << 12)
 193            | (((p)[3] & 0x3F) << 6)
 194            | ((p)[4] & 0x3F));
 195       p += 5;
 196     }
 197
 198   if (len)
 199     *len = p - saved_p;
 200   if (advanced)
 201     *advanced = p;
 202   return c;
 203 }
 204
 205
 206 /* Translate character C by translation table TABLE.  If no translation is
 207    found in TABLE, return the untranslated character.  If TABLE is a list,
 208    elements are char tables.  In that case, recursively translate C by all the
 209    tables in the list.  */
 210
 211 int
 212 translate_char (Lisp_Object table, int c)
 213 {
 214   if (CHAR_TABLE_P (table))
 215     {
 216       Lisp_Object ch;
 217
 218       ch = CHAR_TABLE_REF (table, c);
 219       if (CHARACTERP (ch))
 220         c = XINT (ch);
 221     }
 222   else
 223     {
 224       for (; CONSP (table); table = XCDR (table))
 225         c = translate_char (XCAR (table), c);
 226     }
 227   return c;
 228 }
 229
 230 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 231        doc: /* Return non-nil if OBJECT is a character.
 232 In Emacs Lisp, characters are represented by character codes, which
 233 are non-negative integers.  The function `max-char' returns the
 234 maximum character code.
 235 usage: (characterp OBJECT)  */
 236        attributes: const)
 237   (Lisp_Object object, Lisp_Object ignore)
 238 {
 239   return (CHARACTERP (object) ? Qt : Qnil);
 240 }
 241
 242 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 243        doc: /* Return the character of the maximum code.  */
 244        attributes: const)
 245   (void)
 246 {
 247   return make_number (MAX_CHAR);
 248 }
 249
 250 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 251        Sunibyte_char_to_multibyte, 1, 1, 0,
 252        doc: /* Convert the byte CH to multibyte character.  */)
 253   (Lisp_Object ch)
 254 {
 255   int c;
 256
 257   CHECK_CHARACTER (ch);
 258   c = XFASTINT (ch);
 259   if (c >= 0x100)
 260     error ("Not a unibyte character: %d", c);
 261   MAKE_CHAR_MULTIBYTE (c);
 262   return make_number (c);
 263 }
 264
 265 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 266        Smultibyte_char_to_unibyte, 1, 1, 0,
 267        doc: /* Convert the multibyte character CH to a byte.
 268 If the multibyte character does not represent a byte, return -1.  */)
 269   (Lisp_Object ch)
 270 {
 271   int cm;
 272
 273   CHECK_CHARACTER (ch);
 274   cm = XFASTINT (ch);
 275   if (cm < 256)
 276     /* Can't distinguish a byte read from a unibyte buffer from
 277        a latin1 char, so let's let it slide.  */
 278     return ch;
 279   else
 280     {
 281       int cu = CHAR_TO_BYTE_SAFE (cm);
 282       return make_number (cu);
 283     }
 284 }
 285
 286
 287 /* Return width (columns) of C considering the buffer display table DP. */
 288
 289 static ptrdiff_t
 290 char_width (int c, struct Lisp_Char_Table *dp)
 291 {
 292   ptrdiff_t width = CHAR_WIDTH (c);
 293
 294   if (dp)
 295     {
 296       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 297       int i;
 298
 299       if (VECTORP (disp))
 300         for (i = 0, width = 0; i < ASIZE (disp); i++)
 301           {
 302             ch = AREF (disp, i);
 303             if (CHARACTERP (ch))
 304               {
 305                 int w = CHAR_WIDTH (XFASTINT (ch));
 306                 if (INT_ADD_OVERFLOW (width, w))
 307                   string_overflow ();
 308                 width += w;
 309               }
 310           }
 311     }
 312   return width;
 313 }
 314
 315
 316 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 317        doc: /* Return width of CHAR when displayed in the current buffer.
 318 The width is measured by how many columns it occupies on the screen.
 319 Tab is taken to occupy `tab-width' columns.
 320 usage: (char-width CHAR)  */)
 321   (Lisp_Object ch)
 322 {
 323   int c;
 324   ptrdiff_t width;
 325
 326   CHECK_CHARACTER (ch);
 327   c = XINT (ch);
 328   width = char_width (c, buffer_display_table ());
 329   return make_number (width);
 330 }
 331
 332 /* Return width of string STR of length LEN when displayed in the
 333    current buffer.  The width is measured by how many columns it
 334    occupies on the screen.  If PRECISION > 0, return the width of
 335    longest substring that doesn't exceed PRECISION, and set number of
 336    characters and bytes of the substring in *NCHARS and *NBYTES
 337    respectively.  */
 338
 339 ptrdiff_t
 340 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 341                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 342 {
 343   ptrdiff_t i = 0, i_byte = 0;
 344   ptrdiff_t width = 0;
 345   struct Lisp_Char_Table *dp = buffer_display_table ();
 346
 347   while (i_byte < len)
 348     {
 349       int bytes;
 350       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 351       ptrdiff_t thiswidth = char_width (c, dp);
 352
 353       if (precision <= 0)
 354         {
 355           if (INT_ADD_OVERFLOW (width, thiswidth))
 356             string_overflow ();
 357         }
 358       else if (precision - width < thiswidth)
 359         {
 360           *nchars = i;
 361           *nbytes = i_byte;
 362           return width;
 363         }
 364       i++;
 365       i_byte += bytes;
 366       width += thiswidth;
 367   }
 368
 369   if (precision > 0)
 370     {
 371       *nchars = i;
 372       *nbytes = i_byte;
 373     }
 374
 375   return width;
 376 }
 377
 378 /* Return width of string STR of length LEN when displayed in the
 379    current buffer.  The width is measured by how many columns it
 380    occupies on the screen.  */
 381
 382 ptrdiff_t
 383 strwidth (const char *str, ptrdiff_t len)
 384 {
 385   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 386 }
 387
 388 /* Return width of Lisp string STRING when displayed in the current
 389    buffer.  The width is measured by how many columns it occupies on
 390    the screen while paying attention to compositions.  If PRECISION >
 391    0, return the width of longest substring that doesn't exceed
 392    PRECISION, and set number of characters and bytes of the substring
 393    in *NCHARS and *NBYTES respectively.  */
 394
 395 ptrdiff_t
 396 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 397                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 398 {
 399   ptrdiff_t len = SCHARS (string);
 400   /* This set multibyte to 0 even if STRING is multibyte when it
 401      contains only ascii and eight-bit-graphic, but that's
 402      intentional.  */
 403   bool multibyte = len < SBYTES (string);
 404   unsigned char *str = SDATA (string);
 405   ptrdiff_t i = 0, i_byte = 0;
 406   ptrdiff_t width = 0;
 407   struct Lisp_Char_Table *dp = buffer_display_table ();
 408
 409   while (i < len)
 410     {
 411       ptrdiff_t chars, bytes, thiswidth;
 412       Lisp_Object val;
 413       ptrdiff_t cmp_id;
 414       ptrdiff_t ignore, end;
 415
 416       if (find_composition (i, -1, &ignore, &end, &val, string)
 417           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 418               >= 0))
 419         {
 420           thiswidth = composition_table[cmp_id]->width;
 421           chars = end - i;
 422           bytes = string_char_to_byte (string, end) - i_byte;
 423         }
 424       else
 425         {
 426           int c;
 427
 428           if (multibyte)
 429             {
 430               int cbytes;
 431               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 432               bytes = cbytes;
 433             }
 434           else
 435             c = str[i_byte], bytes = 1;
 436           chars = 1;
 437           thiswidth = char_width (c, dp);
 438         }
 439
 440       if (precision <= 0)
 441         {
 442 #ifdef emacs
 443           if (INT_ADD_OVERFLOW (width, thiswidth))
 444             string_overflow ();
 445 #endif
 446         }
 447       else if (precision - width < thiswidth)
 448         {
 449           *nchars = i;
 450           *nbytes = i_byte;
 451           return width;
 452         }
 453       i += chars;
 454       i_byte += bytes;
 455       width += thiswidth;
 456     }
 457
 458   if (precision > 0)
 459     {
 460       *nchars = i;
 461       *nbytes = i_byte;
 462     }
 463
 464   return width;
 465 }
 466
 467 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 468        doc: /* Return width of STRING when displayed in the current buffer.
 469 Width is measured by how many columns it occupies on the screen.
 470 When calculating width of a multibyte character in STRING,
 471 only the base leading-code is considered; the validity of
 472 the following bytes is not checked.  Tabs in STRING are always
 473 taken to occupy `tab-width' columns.
 474 usage: (string-width STRING)  */)
 475   (Lisp_Object str)
 476 {
 477   Lisp_Object val;
 478
 479   CHECK_STRING (str);
 480   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 481   return val;
 482 }
 483
 484 /* Return the number of characters in the NBYTES bytes at PTR.
 485    This works by looking at the contents and checking for multibyte
 486    sequences while assuming that there's no invalid sequence.
 487    However, if the current buffer has enable-multibyte-characters =
 488    nil, we treat each byte as a character.  */
 489
 490 ptrdiff_t
 491 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 492 {
 493   /* current_buffer is null at early stages of Emacs initialization.  */
 494   if (current_buffer == 0
 495       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 496     return nbytes;
 497
 498   return multibyte_chars_in_text (ptr, nbytes);
 499 }
 500
 501 /* Return the number of characters in the NBYTES bytes at PTR.
 502    This works by looking at the contents and checking for multibyte
 503    sequences while assuming that there's no invalid sequence.  It
 504    ignores enable-multibyte-characters.  */
 505
 506 ptrdiff_t
 507 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 508 {
 509   const unsigned char *endp = ptr + nbytes;
 510   ptrdiff_t chars = 0;
 511
 512   while (ptr < endp)
 513     {
 514       int len = MULTIBYTE_LENGTH (ptr, endp);
 515
 516       if (len == 0)
 517         emacs_abort ();
 518       ptr += len;
 519       chars++;
 520     }
 521
 522   return chars;
 523 }
 524
 525 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 526    characters and bytes in it, and store them in *NCHARS and *NBYTES
 527    respectively.  On counting bytes, pay attention to that 8-bit
 528    characters not constructing a valid multibyte sequence are
 529    represented by 2-byte in a multibyte text.  */
 530
 531 void
 532 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 533                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 534 {
 535   const unsigned char *endp = str + len;
 536   int n;
 537   ptrdiff_t chars = 0, bytes = 0;
 538
 539   if (len >= MAX_MULTIBYTE_LENGTH)
 540     {
 541       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 542       while (str < adjusted_endp)
 543         {
 544           if (! CHAR_BYTE8_HEAD_P (*str)
 545               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 546             str += n, bytes += n;
 547           else
 548             str++, bytes += 2;
 549           chars++;
 550         }
 551     }
 552   while (str < endp)
 553     {
 554       if (! CHAR_BYTE8_HEAD_P (*str)
 555           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 556         str += n, bytes += n;
 557       else
 558         str++, bytes += 2;
 559       chars++;
 560     }
 561
 562   *nchars = chars;
 563   *nbytes = bytes;
 564   return;
 565 }
 566
 567 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 568    It actually converts only such 8-bit characters that don't construct
 569    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 570    NCHARS is nonzero, set *NCHARS to the number of characters in the
 571    text.  It is assured that we can use LEN bytes at STR as a work
 572    area and that is enough.  Return the number of bytes of the
 573    resulting text.  */
 574
 575 ptrdiff_t
 576 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 577                   ptrdiff_t *nchars)
 578 {
 579   unsigned char *p = str, *endp = str + nbytes;
 580   unsigned char *to;
 581   ptrdiff_t chars = 0;
 582   int n;
 583
 584   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 585     {
 586       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 587       while (p < adjusted_endp
 588              && ! CHAR_BYTE8_HEAD_P (*p)
 589              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 590         p += n, chars++;
 591     }
 592   while (p < endp
 593          && ! CHAR_BYTE8_HEAD_P (*p)
 594          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 595     p += n, chars++;
 596   if (nchars)
 597     *nchars = chars;
 598   if (p == endp)
 599     return nbytes;
 600
 601   to = p;
 602   nbytes = endp - p;
 603   endp = str + len;
 604   memmove (endp - nbytes, p, nbytes);
 605   p = endp - nbytes;
 606
 607   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 608     {
 609       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 610       while (p < adjusted_endp)
 611         {
 612           if (! CHAR_BYTE8_HEAD_P (*p)
 613               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 614             {
 615               while (n--)
 616                 *to++ = *p++;
 617             }
 618           else
 619             {
 620               int c = *p++;
 621               c = BYTE8_TO_CHAR (c);
 622               to += CHAR_STRING (c, to);
 623             }
 624         }
 625       chars++;
 626     }
 627   while (p < endp)
 628     {
 629       if (! CHAR_BYTE8_HEAD_P (*p)
 630           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 631         {
 632           while (n--)
 633             *to++ = *p++;
 634         }
 635       else
 636         {
 637           int c = *p++;
 638           c = BYTE8_TO_CHAR (c);
 639           to += CHAR_STRING (c, to);
 640         }
 641       chars++;
 642     }
 643   if (nchars)
 644     *nchars = chars;
 645   return (to - str);
 646 }
 647
 648 /* Parse unibyte string at STR of LEN bytes, and return the number of
 649    bytes it may occupy when converted to multibyte string by
 650    `str_to_multibyte'.  */
 651
 652 ptrdiff_t
 653 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 654 {
 655   const unsigned char *endp = str + len;
 656   ptrdiff_t bytes;
 657
 658   for (bytes = 0; str < endp; str++)
 659     {
 660       int n = *str < 0x80 ? 1 : 2;
 661       if (INT_ADD_OVERFLOW (bytes, n))
 662         string_overflow ();
 663       bytes += n;
 664     }
 665   return bytes;
 666 }
 667
 668
 669 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 670    that contains the same single-byte characters.  It actually
 671    converts all 8-bit characters to multibyte forms.  It is assured
 672    that we can use LEN bytes at STR as a work area and that is
 673    enough.  */
 674
 675 ptrdiff_t
 676 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 677 {
 678   unsigned char *p = str, *endp = str + bytes;
 679   unsigned char *to;
 680
 681   while (p < endp && *p < 0x80) p++;
 682   if (p == endp)
 683     return bytes;
 684   to = p;
 685   bytes = endp - p;
 686   endp = str + len;
 687   memmove (endp - bytes, p, bytes);
 688   p = endp - bytes;
 689   while (p < endp)
 690     {
 691       int c = *p++;
 692
 693       if (c >= 0x80)
 694         c = BYTE8_TO_CHAR (c);
 695       to += CHAR_STRING (c, to);
 696     }
 697   return (to - str);
 698 }
 699
 700 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 701    actually converts characters in the range 0x80..0xFF to
 702    unibyte.  */
 703
 704 ptrdiff_t
 705 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 706 {
 707   const unsigned char *p = str, *endp = str + bytes;
 708   unsigned char *to;
 709   int c, len;
 710
 711   while (p < endp)
 712     {
 713       c = *p;
 714       len = BYTES_BY_CHAR_HEAD (c);
 715       if (CHAR_BYTE8_HEAD_P (c))
 716         break;
 717       p += len;
 718     }
 719   to = str + (p - str);
 720   while (p < endp)
 721     {
 722       c = *p;
 723       len = BYTES_BY_CHAR_HEAD (c);
 724       if (CHAR_BYTE8_HEAD_P (c))
 725         {
 726           c = STRING_CHAR_ADVANCE (p);
 727           *to++ = CHAR_TO_BYTE8 (c);
 728         }
 729       else
 730         {
 731           while (len--) *to++ = *p++;
 732         }
 733     }
 734   return (to - str);
 735 }
 736
 737 /* Convert eight-bit chars in SRC (in multibyte form) to the
 738    corresponding byte and store in DST.  CHARS is the number of
 739    characters in SRC.  The value is the number of bytes stored in DST.
 740    Usually, the value is the same as CHARS, but is less than it if SRC
 741    contains a non-ASCII, non-eight-bit character.  */
 742
 743 ptrdiff_t
 744 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
 745 {
 746   ptrdiff_t i;
 747
 748   for (i = 0; i < chars; i++)
 749     {
 750       int c = STRING_CHAR_ADVANCE (src);
 751
 752       if (CHAR_BYTE8_P (c))
 753         c = CHAR_TO_BYTE8 (c);
 754       else if (! ASCII_CHAR_P (c))
 755         return i;
 756       *dst++ = c;
 757     }
 758   return i;
 759 }
 760
 761
 762 static ptrdiff_t
 763 string_count_byte8 (Lisp_Object string)
 764 {
 765   bool multibyte = STRING_MULTIBYTE (string);
 766   ptrdiff_t nbytes = SBYTES (string);
 767   unsigned char *p = SDATA (string);
 768   unsigned char *pend = p + nbytes;
 769   ptrdiff_t count = 0;
 770   int c, len;
 771
 772   if (multibyte)
 773     while (p < pend)
 774       {
 775         c = *p;
 776         len = BYTES_BY_CHAR_HEAD (c);
 777
 778         if (CHAR_BYTE8_HEAD_P (c))
 779           count++;
 780         p += len;
 781       }
 782   else
 783     while (p < pend)
 784       {
 785         if (*p++ >= 0x80)
 786           count++;
 787       }
 788   return count;
 789 }
 790
 791
 792 Lisp_Object
 793 string_escape_byte8 (Lisp_Object string)
 794 {
 795   ptrdiff_t nchars = SCHARS (string);
 796   ptrdiff_t nbytes = SBYTES (string);
 797   bool multibyte = STRING_MULTIBYTE (string);
 798   ptrdiff_t byte8_count;
 799   const unsigned char *src, *src_end;
 800   unsigned char *dst;
 801   Lisp_Object val;
 802   int c, len;
 803
 804   if (multibyte && nchars == nbytes)
 805     return string;
 806
 807   byte8_count = string_count_byte8 (string);
 808
 809   if (byte8_count == 0)
 810     return string;
 811
 812   if (multibyte)
 813     {
 814       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 815           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 816         string_overflow ();
 817
 818       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 819       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 820                                           nbytes + byte8_count * 2);
 821     }
 822   else
 823     {
 824       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 825         string_overflow ();
 826
 827       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 828       val = make_uninit_string (nbytes + byte8_count * 3);
 829     }
 830
 831   src = SDATA (string);
 832   src_end = src + nbytes;
 833   dst = SDATA (val);
 834   if (multibyte)
 835     while (src < src_end)
 836       {
 837         c = *src;
 838         len = BYTES_BY_CHAR_HEAD (c);
 839
 840         if (CHAR_BYTE8_HEAD_P (c))
 841           {
 842             c = STRING_CHAR_ADVANCE (src);
 843             c = CHAR_TO_BYTE8 (c);
 844             dst += sprintf ((char *) dst, "\\%03o", c + 0u);
 845           }
 846         else
 847           while (len--) *dst++ = *src++;
 848       }
 849   else
 850     while (src < src_end)
 851       {
 852         c = *src++;
 853         if (c >= 0x80)
 854           dst += sprintf ((char *) dst, "\\%03o", c + 0u);
 855         else
 856           *dst++ = c;
 857       }
 858   return val;
 859 }
 860
 861 \f
 862 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 863        doc: /*
 864 Concatenate all the argument characters and make the result a string.
 865 usage: (string &rest CHARACTERS)  */)
 866   (ptrdiff_t n, Lisp_Object *args)
 867 {
 868   ptrdiff_t i;
 869   int c;
 870   unsigned char *buf, *p;
 871   Lisp_Object str;
 872   USE_SAFE_ALLOCA;
 873
 874   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 875   p = buf;
 876
 877   for (i = 0; i < n; i++)
 878     {
 879       CHECK_CHARACTER (args[i]);
 880       c = XINT (args[i]);
 881       p += CHAR_STRING (c, p);
 882     }
 883
 884   str = make_string_from_bytes ((char *) buf, n, p - buf);
 885   SAFE_FREE ();
 886   return str;
 887 }
 888
 889 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 890        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 891 usage: (unibyte-string &rest BYTES)  */)
 892   (ptrdiff_t n, Lisp_Object *args)
 893 {
 894   ptrdiff_t i;
 895   Lisp_Object str;
 896   USE_SAFE_ALLOCA;
 897   unsigned char *buf = SAFE_ALLOCA (n);
 898   unsigned char *p = buf;
 899
 900   for (i = 0; i < n; i++)
 901     {
 902       CHECK_RANGED_INTEGER (args[i], 0, 255);
 903       *p++ = XINT (args[i]);
 904     }
 905
 906   str = make_string_from_bytes ((char *) buf, n, p - buf);
 907   SAFE_FREE ();
 908   return str;
 909 }
 910
 911 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 912        Schar_resolve_modifiers, 1, 1, 0,
 913        doc: /* Resolve modifiers in the character CHAR.
 914 The value is a character with modifiers resolved into the character
 915 code.  Unresolved modifiers are kept in the value.
 916 usage: (char-resolve-modifiers CHAR)  */)
 917   (Lisp_Object character)
 918 {
 919   EMACS_INT c;
 920
 921   CHECK_NUMBER (character);
 922   c = XINT (character);
 923   return make_number (char_resolve_modifier_mask (c));
 924 }
 925
 926 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 927        doc: /* Return a byte value of a character at point.
 928 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 929 a byte value.
 930 Optional 2nd arg STRING, if non-nil, is a string of which first
 931 character is a target to get a byte value.  In this case, POSITION, if
 932 non-nil, is an index of a target character in the string.
 933
 934 If the current buffer (or STRING) is multibyte, and the target
 935 character is not ASCII nor 8-bit character, an error is signaled.  */)
 936   (Lisp_Object position, Lisp_Object string)
 937 {
 938   int c;
 939   ptrdiff_t pos;
 940   unsigned char *p;
 941
 942   if (NILP (string))
 943     {
 944       if (NILP (position))
 945         {
 946           p = PT_ADDR;
 947         }
 948       else
 949         {
 950           CHECK_NUMBER_COERCE_MARKER (position);
 951           if (XINT (position) < BEGV || XINT (position) >= ZV)
 952             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 953           pos = XFASTINT (position);
 954           p = CHAR_POS_ADDR (pos);
 955         }
 956       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 957         return make_number (*p);
 958     }
 959   else
 960     {
 961       CHECK_STRING (string);
 962       if (NILP (position))
 963         {
 964           p = SDATA (string);
 965         }
 966       else
 967         {
 968           CHECK_NATNUM (position);
 969           if (XINT (position) >= SCHARS (string))
 970             args_out_of_range (string, position);
 971           pos = XFASTINT (position);
 972           p = SDATA (string) + string_char_to_byte (string, pos);
 973         }
 974       if (! STRING_MULTIBYTE (string))
 975         return make_number (*p);
 976     }
 977   c = STRING_CHAR (p);
 978   if (CHAR_BYTE8_P (c))
 979     c = CHAR_TO_BYTE8 (c);
 980   else if (! ASCII_CHAR_P (c))
 981     error ("Not an ASCII nor an 8-bit character: %d", c);
 982   return make_number (c);
 983 }
 984
 985 #ifdef emacs
 986
 987 /* Return true if C is an alphabetic character.  */
 988 bool
 989 alphabeticp (int c)
 990 {
 991   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
 992   if (! INTEGERP (category))
 993     return false;
 994   EMACS_INT gen_cat = XINT (category);
 995
 996   /* See UTS #18.  There are additional characters that should be
 997      here, those designated as Other_uppercase, Other_lowercase,
 998      and Other_alphabetic; FIXME.  */
 999   return (gen_cat == UNICODE_CATEGORY_Lu
1000           || gen_cat == UNICODE_CATEGORY_Ll
1001           || gen_cat == UNICODE_CATEGORY_Lt
1002           || gen_cat == UNICODE_CATEGORY_Lm
1003           || gen_cat == UNICODE_CATEGORY_Lo
1004           || gen_cat == UNICODE_CATEGORY_Mn
1005           || gen_cat == UNICODE_CATEGORY_Mc
1006           || gen_cat == UNICODE_CATEGORY_Me
1007           || gen_cat == UNICODE_CATEGORY_Nl);
1008 }
1009
1010 /* Return true if C is a decimal-number character.  */
1011 bool
1012 decimalnump (int c)
1013 {
1014   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1015   if (! INTEGERP (category))
1016     return false;
1017   EMACS_INT gen_cat = XINT (category);
1018
1019   /* See UTS #18.  */
1020   return gen_cat == UNICODE_CATEGORY_Nd;
1021 }
1022
1023 /* Return true if C is a graphic character.  */
1024 bool
1025 graphicp (int c)
1026 {
1027   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1028   if (! INTEGERP (category))
1029     return false;
1030   EMACS_INT gen_cat = XINT (category);
1031
1032   /* See UTS #18.  */
1033   return (!(gen_cat == UNICODE_CATEGORY_Zs /* space separator */
1034             || gen_cat == UNICODE_CATEGORY_Zl /* line separator */
1035             || gen_cat == UNICODE_CATEGORY_Zp /* paragraph separator */
1036             || gen_cat == UNICODE_CATEGORY_Cc /* control */
1037             || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
1038             || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
1039 }
1040
1041 /* Return true if C is a printable character.  */
1042 bool
1043 printablep (int c)
1044 {
1045   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1046   if (! INTEGERP (category))
1047     return false;
1048   EMACS_INT gen_cat = XINT (category);
1049
1050   /* See UTS #18.  */
1051   return (!(gen_cat == UNICODE_CATEGORY_Cc /* control */
1052             || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
1053             || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
1054 }
1055
/* Define the symbols, primitives, and variables provided by this file,
   and give each variable its initial value.  The `doc:' comments
   inside the DEFVAR_LISP forms appear to be the variables'
   documentation text, so they are left exactly as written.  */
void
syms_of_character (void)
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table has no DEFVAR_LISP below; it is registered with
     staticpro instead and starts out as nil.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register each primitive of this file through defsubr.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with a 16-element vector whose elements are all nil.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only space and newline are set to t.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* The table is created with 1 as its initial value; the ranges
     0x80..0x9F and MAX_5_BYTE_CHAR + 1..MAX_CHAR are then set to 4.
     NOTE(review): presumably 4 is the number of columns taken by the
     escape notation used to display such characters -- confirm.  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("printable-chars", Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* The table is created with nil as its initial value; the codes
     32..126 and 160..MAX_5_BYTE_CHAR are then set to t.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Give the symbol a `char-table-extra-slots' property of 1 and then
     create the table with that symbol as its first argument.
     NOTE(review): presumably the property has to be in place before
     Fmake_char_table runs so that the extra slot is allocated --
     confirm.  */
  DEFSYM (Qchar_script_table, "char-script-table");
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS).
SCRIPT is a symbol representing a script or a subgroup of a script.
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* Only a placeholder value is stored here; the correct char-table is
     set up in characters.el.  */
  Vunicode_category_table = Qnil;
}
1135
1136 #endif /* emacs */