src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2015 Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
/* Before reading the code in this file, see the documentation in
   `character.h'.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <intprops.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
/* Char-table of information about which character to unify to which
   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.
   It is protected from garbage collection and initialized to nil in
   syms_of_character.  */
Lisp_Object Vchar_unify_table;
  54
  55 \f
  56
  57 /* If character code C has modifier masks, reflect them to the
  58    character code if possible.  Return the resulting code.  */
  59
  60 EMACS_INT
  61 char_resolve_modifier_mask (EMACS_INT c)
  62 {
  63   /* A non-ASCII character can't reflect modifier bits to the code.  */
  64   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  65     return c;
  66
  67   /* For Meta, Shift, and Control modifiers, we need special care.  */
  68   if (c & CHAR_SHIFT)
  69     {
  70       /* Shift modifier is valid only with [A-Za-z].  */
  71       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  72         c &= ~CHAR_SHIFT;
  73       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  74         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  75       /* Shift modifier for control characters and SPC is ignored.  */
  76       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  77         c &= ~CHAR_SHIFT;
  78     }
  79   if (c & CHAR_CTL)
  80     {
  81       /* Simulate the code in lread.c.  */
  82       /* Allow `\C- ' and `\C-?'.  */
  83       if ((c & 0377) == ' ')
  84         c &= ~0177 & ~ CHAR_CTL;
  85       else if ((c & 0377) == '?')
  86         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  87       /* ASCII control chars are made from letters (both cases),
  88          as well as the non-letters within 0100...0137.  */
  89       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  90         c &= (037 | (~0177 & ~CHAR_CTL));
  91       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
  92         c &= (037 | (~0177 & ~CHAR_CTL));
  93     }
  94 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
  95   if (c & CHAR_META)
  96     {
  97       /* Move the meta bit to the right place for a string.  */
  98       c = (c & ~CHAR_META) | 0x80;
  99     }
 100 #endif
 101
 102   return c;
 103 }
 104
 105
 106 /* Store multibyte form of character C at P.  If C has modifier bits,
 107    handle them appropriately.  */
 108
 109 int
 110 char_string (unsigned int c, unsigned char *p)
 111 {
 112   int bytes;
 113
 114   if (c & CHAR_MODIFIER_MASK)
 115     {
 116       c = char_resolve_modifier_mask (c);
 117       /* If C still has any modifier bits, just ignore it.  */
 118       c &= ~CHAR_MODIFIER_MASK;
 119     }
 120
 121   if (c <= MAX_3_BYTE_CHAR)
 122     {
 123       bytes = CHAR_STRING (c, p);
 124     }
 125   else if (c <= MAX_4_BYTE_CHAR)
 126     {
 127       p[0] = (0xF0 | (c >> 18));
 128       p[1] = (0x80 | ((c >> 12) & 0x3F));
 129       p[2] = (0x80 | ((c >> 6) & 0x3F));
 130       p[3] = (0x80 | (c & 0x3F));
 131       bytes = 4;
 132     }
 133   else if (c <= MAX_5_BYTE_CHAR)
 134     {
 135       p[0] = 0xF8;
 136       p[1] = (0x80 | ((c >> 18) & 0x0F));
 137       p[2] = (0x80 | ((c >> 12) & 0x3F));
 138       p[3] = (0x80 | ((c >> 6) & 0x3F));
 139       p[4] = (0x80 | (c & 0x3F));
 140       bytes = 5;
 141     }
 142   else if (c <= MAX_CHAR)
 143     {
 144       c = CHAR_TO_BYTE8 (c);
 145       bytes = BYTE8_STRING (c, p);
 146     }
 147   else
 148     error ("Invalid character: %x", c);
 149
 150   return bytes;
 151 }
 152
 153
 154 /* Return a character whose multibyte form is at P.  If LEN is not
 155    NULL, it must be a pointer to integer.  In that case, set *LEN to
 156    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 157    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 158    the ending address (i.e., the starting address of the next
 159    character) of the multibyte form.  */
 160
 161 int
 162 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 163 {
 164   int c;
 165   const unsigned char *saved_p = p;
 166
 167   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 168     {
 169       /* 1-, 2-, and 3-byte sequences can be handled by the macro.  */
 170       c = STRING_CHAR_ADVANCE (p);
 171     }
 172   else if (! (*p & 0x08))
 173     {
 174       /* A 4-byte sequence of this form:
 175          11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  */
 176       c = ((((p)[0] & 0x7) << 18)
 177            | (((p)[1] & 0x3F) << 12)
 178            | (((p)[2] & 0x3F) << 6)
 179            | ((p)[3] & 0x3F));
 180       p += 4;
 181     }
 182   else
 183     {
 184       /* A 5-byte sequence of this form:
 185
 186          111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 187
 188          Note that the top 4 `x's are always 0, so shifting p[1] can
 189          never exceed the maximum valid character codepoint. */
 190       c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
 191            (((p)[1] & 0x3F) << 18)
 192            | (((p)[2] & 0x3F) << 12)
 193            | (((p)[3] & 0x3F) << 6)
 194            | ((p)[4] & 0x3F));
 195       p += 5;
 196     }
 197
 198   if (len)
 199     *len = p - saved_p;
 200   if (advanced)
 201     *advanced = p;
 202   return c;
 203 }
 204
 205
 206 /* Translate character C by translation table TABLE.  If no translation is
 207    found in TABLE, return the untranslated character.  If TABLE is a list,
 208    elements are char tables.  In that case, recursively translate C by all the
 209    tables in the list.  */
 210
 211 int
 212 translate_char (Lisp_Object table, int c)
 213 {
 214   if (CHAR_TABLE_P (table))
 215     {
 216       Lisp_Object ch;
 217
 218       ch = CHAR_TABLE_REF (table, c);
 219       if (CHARACTERP (ch))
 220         c = XINT (ch);
 221     }
 222   else
 223     {
 224       for (; CONSP (table); table = XCDR (table))
 225         c = translate_char (XCAR (table), c);
 226     }
 227   return c;
 228 }
 229
 230 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 231        doc: /* Return non-nil if OBJECT is a character.
 232 In Emacs Lisp, characters are represented by character codes, which
 233 are non-negative integers.  The function `max-char' returns the
 234 maximum character code.
 235 usage: (characterp OBJECT)  */)
 236   (Lisp_Object object, Lisp_Object ignore)
 237 {
 238   return (CHARACTERP (object) ? Qt : Qnil);
 239 }
 240
 241 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 242        doc: /* Return the character of the maximum code.  */)
 243   (void)
 244 {
 245   return make_number (MAX_CHAR);
 246 }
 247
 248 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 249        Sunibyte_char_to_multibyte, 1, 1, 0,
 250        doc: /* Convert the byte CH to multibyte character.  */)
 251   (Lisp_Object ch)
 252 {
 253   int c;
 254
 255   CHECK_CHARACTER (ch);
 256   c = XFASTINT (ch);
 257   if (c >= 0x100)
 258     error ("Not a unibyte character: %d", c);
 259   MAKE_CHAR_MULTIBYTE (c);
 260   return make_number (c);
 261 }
 262
 263 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 264        Smultibyte_char_to_unibyte, 1, 1, 0,
 265        doc: /* Convert the multibyte character CH to a byte.
 266 If the multibyte character does not represent a byte, return -1.  */)
 267   (Lisp_Object ch)
 268 {
 269   int cm;
 270
 271   CHECK_CHARACTER (ch);
 272   cm = XFASTINT (ch);
 273   if (cm < 256)
 274     /* Can't distinguish a byte read from a unibyte buffer from
 275        a latin1 char, so let's let it slide.  */
 276     return ch;
 277   else
 278     {
 279       int cu = CHAR_TO_BYTE_SAFE (cm);
 280       return make_number (cu);
 281     }
 282 }
 283
 284
 285 /* Return width (columns) of C considering the buffer display table DP. */
 286
 287 static ptrdiff_t
 288 char_width (int c, struct Lisp_Char_Table *dp)
 289 {
 290   ptrdiff_t width = CHAR_WIDTH (c);
 291
 292   if (dp)
 293     {
 294       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 295       int i;
 296
 297       if (VECTORP (disp))
 298         for (i = 0, width = 0; i < ASIZE (disp); i++)
 299           {
 300             ch = AREF (disp, i);
 301             if (CHARACTERP (ch))
 302               {
 303                 int w = CHAR_WIDTH (XFASTINT (ch));
 304                 if (INT_ADD_OVERFLOW (width, w))
 305                   string_overflow ();
 306                 width += w;
 307               }
 308           }
 309     }
 310   return width;
 311 }
 312
 313
 314 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 315        doc: /* Return width of CHAR when displayed in the current buffer.
 316 The width is measured by how many columns it occupies on the screen.
 317 Tab is taken to occupy `tab-width' columns.
 318 usage: (char-width CHAR)  */)
 319   (Lisp_Object ch)
 320 {
 321   int c;
 322   ptrdiff_t width;
 323
 324   CHECK_CHARACTER (ch);
 325   c = XINT (ch);
 326   width = char_width (c, buffer_display_table ());
 327   return make_number (width);
 328 }
 329
 330 /* Return width of string STR of length LEN when displayed in the
 331    current buffer.  The width is measured by how many columns it
 332    occupies on the screen.  If PRECISION > 0, return the width of
 333    longest substring that doesn't exceed PRECISION, and set number of
 334    characters and bytes of the substring in *NCHARS and *NBYTES
 335    respectively.  */
 336
 337 ptrdiff_t
 338 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 339                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 340 {
 341   ptrdiff_t i = 0, i_byte = 0;
 342   ptrdiff_t width = 0;
 343   struct Lisp_Char_Table *dp = buffer_display_table ();
 344
 345   while (i_byte < len)
 346     {
 347       int bytes;
 348       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 349       ptrdiff_t thiswidth = char_width (c, dp);
 350
 351       if (precision <= 0)
 352         {
 353           if (INT_ADD_OVERFLOW (width, thiswidth))
 354             string_overflow ();
 355         }
 356       else if (precision - width < thiswidth)
 357         {
 358           *nchars = i;
 359           *nbytes = i_byte;
 360           return width;
 361         }
 362       i++;
 363       i_byte += bytes;
 364       width += thiswidth;
 365   }
 366
 367   if (precision > 0)
 368     {
 369       *nchars = i;
 370       *nbytes = i_byte;
 371     }
 372
 373   return width;
 374 }
 375
 376 /* Return width of string STR of length LEN when displayed in the
 377    current buffer.  The width is measured by how many columns it
 378    occupies on the screen.  */
 379
 380 ptrdiff_t
 381 strwidth (const char *str, ptrdiff_t len)
 382 {
 383   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 384 }
 385
 386 /* Return width of Lisp string STRING when displayed in the current
 387    buffer.  The width is measured by how many columns it occupies on
 388    the screen while paying attention to compositions.  If PRECISION >
 389    0, return the width of longest substring that doesn't exceed
 390    PRECISION, and set number of characters and bytes of the substring
 391    in *NCHARS and *NBYTES respectively.  */
 392
 393 ptrdiff_t
 394 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 395                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 396 {
 397   ptrdiff_t len = SCHARS (string);
 398   /* This set multibyte to 0 even if STRING is multibyte when it
 399      contains only ascii and eight-bit-graphic, but that's
 400      intentional.  */
 401   bool multibyte = len < SBYTES (string);
 402   unsigned char *str = SDATA (string);
 403   ptrdiff_t i = 0, i_byte = 0;
 404   ptrdiff_t width = 0;
 405   struct Lisp_Char_Table *dp = buffer_display_table ();
 406
 407   while (i < len)
 408     {
 409       ptrdiff_t chars, bytes, thiswidth;
 410       Lisp_Object val;
 411       ptrdiff_t cmp_id;
 412       ptrdiff_t ignore, end;
 413
 414       if (find_composition (i, -1, &ignore, &end, &val, string)
 415           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 416               >= 0))
 417         {
 418           thiswidth = composition_table[cmp_id]->width;
 419           chars = end - i;
 420           bytes = string_char_to_byte (string, end) - i_byte;
 421         }
 422       else
 423         {
 424           int c;
 425
 426           if (multibyte)
 427             {
 428               int cbytes;
 429               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 430               bytes = cbytes;
 431             }
 432           else
 433             c = str[i_byte], bytes = 1;
 434           chars = 1;
 435           thiswidth = char_width (c, dp);
 436         }
 437
 438       if (precision <= 0)
 439         {
 440 #ifdef emacs
 441           if (INT_ADD_OVERFLOW (width, thiswidth))
 442             string_overflow ();
 443 #endif
 444         }
 445       else if (precision - width < thiswidth)
 446         {
 447           *nchars = i;
 448           *nbytes = i_byte;
 449           return width;
 450         }
 451       i += chars;
 452       i_byte += bytes;
 453       width += thiswidth;
 454     }
 455
 456   if (precision > 0)
 457     {
 458       *nchars = i;
 459       *nbytes = i_byte;
 460     }
 461
 462   return width;
 463 }
 464
 465 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 466        doc: /* Return width of STRING when displayed in the current buffer.
 467 Width is measured by how many columns it occupies on the screen.
 468 When calculating width of a multibyte character in STRING,
 469 only the base leading-code is considered; the validity of
 470 the following bytes is not checked.  Tabs in STRING are always
 471 taken to occupy `tab-width' columns.
 472 usage: (string-width STRING)  */)
 473   (Lisp_Object str)
 474 {
 475   Lisp_Object val;
 476
 477   CHECK_STRING (str);
 478   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 479   return val;
 480 }
 481
 482 /* Return the number of characters in the NBYTES bytes at PTR.
 483    This works by looking at the contents and checking for multibyte
 484    sequences while assuming that there's no invalid sequence.
 485    However, if the current buffer has enable-multibyte-characters =
 486    nil, we treat each byte as a character.  */
 487
 488 ptrdiff_t
 489 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 490 {
 491   /* current_buffer is null at early stages of Emacs initialization.  */
 492   if (current_buffer == 0
 493       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 494     return nbytes;
 495
 496   return multibyte_chars_in_text (ptr, nbytes);
 497 }
 498
 499 /* Return the number of characters in the NBYTES bytes at PTR.
 500    This works by looking at the contents and checking for multibyte
 501    sequences while assuming that there's no invalid sequence.  It
 502    ignores enable-multibyte-characters.  */
 503
 504 ptrdiff_t
 505 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 506 {
 507   const unsigned char *endp = ptr + nbytes;
 508   ptrdiff_t chars = 0;
 509
 510   while (ptr < endp)
 511     {
 512       int len = MULTIBYTE_LENGTH (ptr, endp);
 513
 514       if (len == 0)
 515         emacs_abort ();
 516       ptr += len;
 517       chars++;
 518     }
 519
 520   return chars;
 521 }
 522
 523 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 524    characters and bytes in it, and store them in *NCHARS and *NBYTES
 525    respectively.  On counting bytes, pay attention to that 8-bit
 526    characters not constructing a valid multibyte sequence are
 527    represented by 2-byte in a multibyte text.  */
 528
 529 void
 530 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 531                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 532 {
 533   const unsigned char *endp = str + len;
 534   int n;
 535   ptrdiff_t chars = 0, bytes = 0;
 536
 537   if (len >= MAX_MULTIBYTE_LENGTH)
 538     {
 539       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 540       while (str < adjusted_endp)
 541         {
 542           if (! CHAR_BYTE8_HEAD_P (*str)
 543               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 544             str += n, bytes += n;
 545           else
 546             str++, bytes += 2;
 547           chars++;
 548         }
 549     }
 550   while (str < endp)
 551     {
 552       if (! CHAR_BYTE8_HEAD_P (*str)
 553           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 554         str += n, bytes += n;
 555       else
 556         str++, bytes += 2;
 557       chars++;
 558     }
 559
 560   *nchars = chars;
 561   *nbytes = bytes;
 562   return;
 563 }
 564
 565 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 566    It actually converts only such 8-bit characters that don't construct
 567    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 568    NCHARS is nonzero, set *NCHARS to the number of characters in the
 569    text.  It is assured that we can use LEN bytes at STR as a work
 570    area and that is enough.  Return the number of bytes of the
 571    resulting text.  */
 572
 573 ptrdiff_t
 574 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 575                   ptrdiff_t *nchars)
 576 {
 577   unsigned char *p = str, *endp = str + nbytes;
 578   unsigned char *to;
 579   ptrdiff_t chars = 0;
 580   int n;
 581
 582   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 583     {
 584       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 585       while (p < adjusted_endp
 586              && ! CHAR_BYTE8_HEAD_P (*p)
 587              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 588         p += n, chars++;
 589     }
 590   while (p < endp
 591          && ! CHAR_BYTE8_HEAD_P (*p)
 592          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 593     p += n, chars++;
 594   if (nchars)
 595     *nchars = chars;
 596   if (p == endp)
 597     return nbytes;
 598
 599   to = p;
 600   nbytes = endp - p;
 601   endp = str + len;
 602   memmove (endp - nbytes, p, nbytes);
 603   p = endp - nbytes;
 604
 605   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 606     {
 607       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 608       while (p < adjusted_endp)
 609         {
 610           if (! CHAR_BYTE8_HEAD_P (*p)
 611               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 612             {
 613               while (n--)
 614                 *to++ = *p++;
 615             }
 616           else
 617             {
 618               int c = *p++;
 619               c = BYTE8_TO_CHAR (c);
 620               to += CHAR_STRING (c, to);
 621             }
 622         }
 623       chars++;
 624     }
 625   while (p < endp)
 626     {
 627       if (! CHAR_BYTE8_HEAD_P (*p)
 628           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 629         {
 630           while (n--)
 631             *to++ = *p++;
 632         }
 633       else
 634         {
 635           int c = *p++;
 636           c = BYTE8_TO_CHAR (c);
 637           to += CHAR_STRING (c, to);
 638         }
 639       chars++;
 640     }
 641   if (nchars)
 642     *nchars = chars;
 643   return (to - str);
 644 }
 645
 646 /* Parse unibyte string at STR of LEN bytes, and return the number of
 647    bytes it may occupy when converted to multibyte string by
 648    `str_to_multibyte'.  */
 649
 650 ptrdiff_t
 651 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 652 {
 653   const unsigned char *endp = str + len;
 654   ptrdiff_t bytes;
 655
 656   for (bytes = 0; str < endp; str++)
 657     {
 658       int n = *str < 0x80 ? 1 : 2;
 659       if (INT_ADD_OVERFLOW (bytes, n))
 660         string_overflow ();
 661       bytes += n;
 662     }
 663   return bytes;
 664 }
 665
 666
 667 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 668    that contains the same single-byte characters.  It actually
 669    converts all 8-bit characters to multibyte forms.  It is assured
 670    that we can use LEN bytes at STR as a work area and that is
 671    enough.  */
 672
 673 ptrdiff_t
 674 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 675 {
 676   unsigned char *p = str, *endp = str + bytes;
 677   unsigned char *to;
 678
 679   while (p < endp && *p < 0x80) p++;
 680   if (p == endp)
 681     return bytes;
 682   to = p;
 683   bytes = endp - p;
 684   endp = str + len;
 685   memmove (endp - bytes, p, bytes);
 686   p = endp - bytes;
 687   while (p < endp)
 688     {
 689       int c = *p++;
 690
 691       if (c >= 0x80)
 692         c = BYTE8_TO_CHAR (c);
 693       to += CHAR_STRING (c, to);
 694     }
 695   return (to - str);
 696 }
 697
 698 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 699    actually converts characters in the range 0x80..0xFF to
 700    unibyte.  */
 701
 702 ptrdiff_t
 703 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 704 {
 705   const unsigned char *p = str, *endp = str + bytes;
 706   unsigned char *to;
 707   int c, len;
 708
 709   while (p < endp)
 710     {
 711       c = *p;
 712       len = BYTES_BY_CHAR_HEAD (c);
 713       if (CHAR_BYTE8_HEAD_P (c))
 714         break;
 715       p += len;
 716     }
 717   to = str + (p - str);
 718   while (p < endp)
 719     {
 720       c = *p;
 721       len = BYTES_BY_CHAR_HEAD (c);
 722       if (CHAR_BYTE8_HEAD_P (c))
 723         {
 724           c = STRING_CHAR_ADVANCE (p);
 725           *to++ = CHAR_TO_BYTE8 (c);
 726         }
 727       else
 728         {
 729           while (len--) *to++ = *p++;
 730         }
 731     }
 732   return (to - str);
 733 }
 734
 735 /* Convert eight-bit chars in SRC (in multibyte form) to the
 736    corresponding byte and store in DST.  CHARS is the number of
 737    characters in SRC.  The value is the number of bytes stored in DST.
 738    Usually, the value is the same as CHARS, but is less than it if SRC
 739    contains a non-ASCII, non-eight-bit character.  */
 740
 741 ptrdiff_t
 742 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
 743 {
 744   ptrdiff_t i;
 745
 746   for (i = 0; i < chars; i++)
 747     {
 748       int c = STRING_CHAR_ADVANCE (src);
 749
 750       if (CHAR_BYTE8_P (c))
 751         c = CHAR_TO_BYTE8 (c);
 752       else if (! ASCII_CHAR_P (c))
 753         return i;
 754       *dst++ = c;
 755     }
 756   return i;
 757 }
 758
 759
 760 static ptrdiff_t
 761 string_count_byte8 (Lisp_Object string)
 762 {
 763   bool multibyte = STRING_MULTIBYTE (string);
 764   ptrdiff_t nbytes = SBYTES (string);
 765   unsigned char *p = SDATA (string);
 766   unsigned char *pend = p + nbytes;
 767   ptrdiff_t count = 0;
 768   int c, len;
 769
 770   if (multibyte)
 771     while (p < pend)
 772       {
 773         c = *p;
 774         len = BYTES_BY_CHAR_HEAD (c);
 775
 776         if (CHAR_BYTE8_HEAD_P (c))
 777           count++;
 778         p += len;
 779       }
 780   else
 781     while (p < pend)
 782       {
 783         if (*p++ >= 0x80)
 784           count++;
 785       }
 786   return count;
 787 }
 788
 789
 790 Lisp_Object
 791 string_escape_byte8 (Lisp_Object string)
 792 {
 793   ptrdiff_t nchars = SCHARS (string);
 794   ptrdiff_t nbytes = SBYTES (string);
 795   bool multibyte = STRING_MULTIBYTE (string);
 796   ptrdiff_t byte8_count;
 797   const unsigned char *src, *src_end;
 798   unsigned char *dst;
 799   Lisp_Object val;
 800   int c, len;
 801
 802   if (multibyte && nchars == nbytes)
 803     return string;
 804
 805   byte8_count = string_count_byte8 (string);
 806
 807   if (byte8_count == 0)
 808     return string;
 809
 810   if (multibyte)
 811     {
 812       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 813           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 814         string_overflow ();
 815
 816       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 817       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 818                                           nbytes + byte8_count * 2);
 819     }
 820   else
 821     {
 822       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 823         string_overflow ();
 824
 825       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 826       val = make_uninit_string (nbytes + byte8_count * 3);
 827     }
 828
 829   src = SDATA (string);
 830   src_end = src + nbytes;
 831   dst = SDATA (val);
 832   if (multibyte)
 833     while (src < src_end)
 834       {
 835         c = *src;
 836         len = BYTES_BY_CHAR_HEAD (c);
 837
 838         if (CHAR_BYTE8_HEAD_P (c))
 839           {
 840             c = STRING_CHAR_ADVANCE (src);
 841             c = CHAR_TO_BYTE8 (c);
 842             dst += sprintf ((char *) dst, "\\%03o", c);
 843           }
 844         else
 845           while (len--) *dst++ = *src++;
 846       }
 847   else
 848     while (src < src_end)
 849       {
 850         c = *src++;
 851         if (c >= 0x80)
 852           dst += sprintf ((char *) dst, "\\%03o", c);
 853         else
 854           *dst++ = c;
 855       }
 856   return val;
 857 }
 858
 859 \f
 860 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 861        doc: /*
 862 Concatenate all the argument characters and make the result a string.
 863 usage: (string &rest CHARACTERS)  */)
 864   (ptrdiff_t n, Lisp_Object *args)
 865 {
 866   ptrdiff_t i;
 867   int c;
 868   unsigned char *buf, *p;
 869   Lisp_Object str;
 870   USE_SAFE_ALLOCA;
 871
 872   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 873   p = buf;
 874
 875   for (i = 0; i < n; i++)
 876     {
 877       CHECK_CHARACTER (args[i]);
 878       c = XINT (args[i]);
 879       p += CHAR_STRING (c, p);
 880     }
 881
 882   str = make_string_from_bytes ((char *) buf, n, p - buf);
 883   SAFE_FREE ();
 884   return str;
 885 }
 886
 887 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 888        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 889 usage: (unibyte-string &rest BYTES)  */)
 890   (ptrdiff_t n, Lisp_Object *args)
 891 {
 892   ptrdiff_t i;
 893   Lisp_Object str;
 894   USE_SAFE_ALLOCA;
 895   unsigned char *buf = SAFE_ALLOCA (n);
 896   unsigned char *p = buf;
 897
 898   for (i = 0; i < n; i++)
 899     {
 900       CHECK_RANGED_INTEGER (args[i], 0, 255);
 901       *p++ = XINT (args[i]);
 902     }
 903
 904   str = make_string_from_bytes ((char *) buf, n, p - buf);
 905   SAFE_FREE ();
 906   return str;
 907 }
 908
 909 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 910        Schar_resolve_modifiers, 1, 1, 0,
 911        doc: /* Resolve modifiers in the character CHAR.
 912 The value is a character with modifiers resolved into the character
 913 code.  Unresolved modifiers are kept in the value.
 914 usage: (char-resolve-modifiers CHAR)  */)
 915   (Lisp_Object character)
 916 {
 917   EMACS_INT c;
 918
 919   CHECK_NUMBER (character);
 920   c = XINT (character);
 921   return make_number (char_resolve_modifier_mask (c));
 922 }
 923
 924 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 925        doc: /* Return a byte value of a character at point.
 926 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 927 a byte value.
 928 Optional 2nd arg STRING, if non-nil, is a string of which first
 929 character is a target to get a byte value.  In this case, POSITION, if
 930 non-nil, is an index of a target character in the string.
 931
 932 If the current buffer (or STRING) is multibyte, and the target
 933 character is not ASCII nor 8-bit character, an error is signaled.  */)
 934   (Lisp_Object position, Lisp_Object string)
 935 {
 936   int c;
 937   ptrdiff_t pos;
 938   unsigned char *p;
 939
 940   if (NILP (string))
 941     {
 942       if (NILP (position))
 943         {
 944           p = PT_ADDR;
 945         }
 946       else
 947         {
 948           CHECK_NUMBER_COERCE_MARKER (position);
 949           if (XINT (position) < BEGV || XINT (position) >= ZV)
 950             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 951           pos = XFASTINT (position);
 952           p = CHAR_POS_ADDR (pos);
 953         }
 954       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 955         return make_number (*p);
 956     }
 957   else
 958     {
 959       CHECK_STRING (string);
 960       if (NILP (position))
 961         {
 962           p = SDATA (string);
 963         }
 964       else
 965         {
 966           CHECK_NATNUM (position);
 967           if (XINT (position) >= SCHARS (string))
 968             args_out_of_range (string, position);
 969           pos = XFASTINT (position);
 970           p = SDATA (string) + string_char_to_byte (string, pos);
 971         }
 972       if (! STRING_MULTIBYTE (string))
 973         return make_number (*p);
 974     }
 975   c = STRING_CHAR (p);
 976   if (CHAR_BYTE8_P (c))
 977     c = CHAR_TO_BYTE8 (c);
 978   else if (! ASCII_CHAR_P (c))
 979     error ("Not an ASCII nor an 8-bit character: %d", c);
 980   return make_number (c);
 981 }
 982
 983 #ifdef emacs
 984
 985 void
 986 syms_of_character (void)
 987 {
 988   DEFSYM (Qcharacterp, "characterp");
 989   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
 990
 991   staticpro (&Vchar_unify_table);
 992   Vchar_unify_table = Qnil;
 993
 994   defsubr (&Smax_char);
 995   defsubr (&Scharacterp);
 996   defsubr (&Sunibyte_char_to_multibyte);
 997   defsubr (&Smultibyte_char_to_unibyte);
 998   defsubr (&Schar_width);
 999   defsubr (&Sstring_width);
1000   defsubr (&Sstring);
1001   defsubr (&Sunibyte_string);
1002   defsubr (&Schar_resolve_modifiers);
1003   defsubr (&Sget_byte);
1004
1005   DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
1006                doc: /*
1007 Vector recording all translation tables ever defined.
1008 Each element is a pair (SYMBOL . TABLE) relating the table to the
1009 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1010   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1011
1012   DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
1013                doc: /*
1014 A char-table for characters which invoke auto-filling.
1015 Such characters have value t in this table.  */);
1016   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1017   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1018   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1019
1020   DEFVAR_LISP ("char-width-table", Vchar_width_table,
1021                doc: /*
1022 A char-table for width (columns) of each character.  */);
1023   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1024   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1025   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1026                         make_number (4));
1027
1028   DEFVAR_LISP ("printable-chars", Vprintable_chars,
1029                doc: /* A char-table for each printable character.  */);
1030   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1031   Fset_char_table_range (Vprintable_chars,
1032                          Fcons (make_number (32), make_number (126)), Qt);
1033   Fset_char_table_range (Vprintable_chars,
1034                          Fcons (make_number (160),
1035                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1036
1037   DEFVAR_LISP ("char-script-table", Vchar_script_table,
1038                doc: /* Char table of script symbols.
1039 It has one extra slot whose value is a list of script symbols.  */);
1040
1041   DEFSYM (Qchar_script_table, "char-script-table");
1042   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1043   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1044
1045   DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
1046                doc: /* Alist of scripts vs the representative characters.
1047 Each element is a cons (SCRIPT . CHARS).
1048 SCRIPT is a symbol representing a script or a subgroup of a script.
1049 CHARS is a list or a vector of characters.
1050 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1051 If it is a vector, one of the characters in the vector is necessary.
1052 This variable is used to find a font for a specific script.  */);
1053   Vscript_representative_chars = Qnil;
1054
1055   DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
1056                doc: /* Char table of Unicode's "General Category".
1057 All Unicode characters have one of the following values (symbol):
1058   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1059   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1060 See The Unicode Standard for the meaning of those values.  */);
1061   /* The correct char-table is setup in characters.el.  */
1062   Vunicode_category_table = Qnil;
1063 }
1064
1065 #endif /* emacs */