src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2014 Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <intprops.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
  51 Lisp_Object Qcharacterp;
  52
  53 static Lisp_Object Qauto_fill_chars;
  54
  55 /* Char-table of information about which character to unify to which
  56    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  57 Lisp_Object Vchar_unify_table;
  58
  59 static Lisp_Object Qchar_script_table;
  60
  61 \f
  62
  63 /* If character code C has modifier masks, reflect them to the
  64    character code if possible.  Return the resulting code.  */
  65
  66 EMACS_INT
  67 char_resolve_modifier_mask (EMACS_INT c)
  68 {
  69   /* A non-ASCII character can't reflect modifier bits to the code.  */
  70   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  71     return c;
  72
  73   /* For Meta, Shift, and Control modifiers, we need special care.  */
  74   if (c & CHAR_SHIFT)
  75     {
  76       /* Shift modifier is valid only with [A-Za-z].  */
  77       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  78         c &= ~CHAR_SHIFT;
  79       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  80         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  81       /* Shift modifier for control characters and SPC is ignored.  */
  82       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  83         c &= ~CHAR_SHIFT;
  84     }
  85   if (c & CHAR_CTL)
  86     {
  87       /* Simulate the code in lread.c.  */
  88       /* Allow `\C- ' and `\C-?'.  */
  89       if ((c & 0377) == ' ')
  90         c &= ~0177 & ~ CHAR_CTL;
  91       else if ((c & 0377) == '?')
  92         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  93       /* ASCII control chars are made from letters (both cases),
  94          as well as the non-letters within 0100...0137.  */
  95       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  96         c &= (037 | (~0177 & ~CHAR_CTL));
  97       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
  98         c &= (037 | (~0177 & ~CHAR_CTL));
  99     }
 100 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 101   if (c & CHAR_META)
 102     {
 103       /* Move the meta bit to the right place for a string.  */
 104       c = (c & ~CHAR_META) | 0x80;
 105     }
 106 #endif
 107
 108   return c;
 109 }
 110
 111
 112 /* Store multibyte form of character C at P.  If C has modifier bits,
 113    handle them appropriately.  */
 114
 115 int
 116 char_string (unsigned int c, unsigned char *p)
 117 {
 118   int bytes;
 119
 120   if (c & CHAR_MODIFIER_MASK)
 121     {
 122       c = char_resolve_modifier_mask (c);
 123       /* If C still has any modifier bits, just ignore it.  */
 124       c &= ~CHAR_MODIFIER_MASK;
 125     }
 126
 127   if (c <= MAX_3_BYTE_CHAR)
 128     {
 129       bytes = CHAR_STRING (c, p);
 130     }
 131   else if (c <= MAX_4_BYTE_CHAR)
 132     {
 133       p[0] = (0xF0 | (c >> 18));
 134       p[1] = (0x80 | ((c >> 12) & 0x3F));
 135       p[2] = (0x80 | ((c >> 6) & 0x3F));
 136       p[3] = (0x80 | (c & 0x3F));
 137       bytes = 4;
 138     }
 139   else if (c <= MAX_5_BYTE_CHAR)
 140     {
 141       p[0] = 0xF8;
 142       p[1] = (0x80 | ((c >> 18) & 0x0F));
 143       p[2] = (0x80 | ((c >> 12) & 0x3F));
 144       p[3] = (0x80 | ((c >> 6) & 0x3F));
 145       p[4] = (0x80 | (c & 0x3F));
 146       bytes = 5;
 147     }
 148   else if (c <= MAX_CHAR)
 149     {
 150       c = CHAR_TO_BYTE8 (c);
 151       bytes = BYTE8_STRING (c, p);
 152     }
 153   else
 154     error ("Invalid character: %x", c);
 155
 156   return bytes;
 157 }
 158
 159
 160 /* Return a character whose multibyte form is at P.  If LEN is not
 161    NULL, it must be a pointer to integer.  In that case, set *LEN to
 162    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 163    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 164    the ending address (i.e., the starting address of the next
 165    character) of the multibyte form.  */
 166
 167 int
 168 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 169 {
 170   int c;
 171   const unsigned char *saved_p = p;
 172
 173   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 174     {
 175       /* 1-, 2-, and 3-byte sequences can be handled by the macro.  */
 176       c = STRING_CHAR_ADVANCE (p);
 177     }
 178   else if (! (*p & 0x08))
 179     {
 180       /* A 4-byte sequence of this form:
 181          11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  */
 182       c = ((((p)[0] & 0x7) << 18)
 183            | (((p)[1] & 0x3F) << 12)
 184            | (((p)[2] & 0x3F) << 6)
 185            | ((p)[3] & 0x3F));
 186       p += 4;
 187     }
 188   else
 189     {
 190       /* A 5-byte sequence of this form:
 191
 192          111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 193
 194          Note that the top 4 `x's are always 0, so shifting p[1] can
 195          never exceed the maximum valid character codepoint. */
 196       c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
 197            (((p)[1] & 0x3F) << 18)
 198            | (((p)[2] & 0x3F) << 12)
 199            | (((p)[3] & 0x3F) << 6)
 200            | ((p)[4] & 0x3F));
 201       p += 5;
 202     }
 203
 204   if (len)
 205     *len = p - saved_p;
 206   if (advanced)
 207     *advanced = p;
 208   return c;
 209 }
 210
 211
 212 /* Translate character C by translation table TABLE.  If no translation is
 213    found in TABLE, return the untranslated character.  If TABLE is a list,
 214    elements are char tables.  In that case, recursively translate C by all the
 215    tables in the list.  */
 216
 217 int
 218 translate_char (Lisp_Object table, int c)
 219 {
 220   if (CHAR_TABLE_P (table))
 221     {
 222       Lisp_Object ch;
 223
 224       ch = CHAR_TABLE_REF (table, c);
 225       if (CHARACTERP (ch))
 226         c = XINT (ch);
 227     }
 228   else
 229     {
 230       for (; CONSP (table); table = XCDR (table))
 231         c = translate_char (XCAR (table), c);
 232     }
 233   return c;
 234 }
 235
 236 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 237        doc: /* Return non-nil if OBJECT is a character.
 238 In Emacs Lisp, characters are represented by character codes, which
 239 are non-negative integers.  The function `max-char' returns the
 240 maximum character code.
 241 usage: (characterp OBJECT)  */)
 242   (Lisp_Object object, Lisp_Object ignore)
 243 {
 244   return (CHARACTERP (object) ? Qt : Qnil);
 245 }
 246
 247 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 248        doc: /* Return the character of the maximum code.  */)
 249   (void)
 250 {
 251   return make_number (MAX_CHAR);
 252 }
 253
 254 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 255        Sunibyte_char_to_multibyte, 1, 1, 0,
 256        doc: /* Convert the byte CH to multibyte character.  */)
 257   (Lisp_Object ch)
 258 {
 259   int c;
 260
 261   CHECK_CHARACTER (ch);
 262   c = XFASTINT (ch);
 263   if (c >= 0x100)
 264     error ("Not a unibyte character: %d", c);
 265   MAKE_CHAR_MULTIBYTE (c);
 266   return make_number (c);
 267 }
 268
 269 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 270        Smultibyte_char_to_unibyte, 1, 1, 0,
 271        doc: /* Convert the multibyte character CH to a byte.
 272 If the multibyte character does not represent a byte, return -1.  */)
 273   (Lisp_Object ch)
 274 {
 275   int cm;
 276
 277   CHECK_CHARACTER (ch);
 278   cm = XFASTINT (ch);
 279   if (cm < 256)
 280     /* Can't distinguish a byte read from a unibyte buffer from
 281        a latin1 char, so let's let it slide.  */
 282     return ch;
 283   else
 284     {
 285       int cu = CHAR_TO_BYTE_SAFE (cm);
 286       return make_number (cu);
 287     }
 288 }
 289
 290
 291 /* Return width (columns) of C considering the buffer display table DP. */
 292
 293 static ptrdiff_t
 294 char_width (int c, struct Lisp_Char_Table *dp)
 295 {
 296   ptrdiff_t width = CHAR_WIDTH (c);
 297
 298   if (dp)
 299     {
 300       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 301       int i;
 302
 303       if (VECTORP (disp))
 304         for (i = 0, width = 0; i < ASIZE (disp); i++)
 305           {
 306             ch = AREF (disp, i);
 307             if (CHARACTERP (ch))
 308               {
 309                 int w = CHAR_WIDTH (XFASTINT (ch));
 310                 if (INT_ADD_OVERFLOW (width, w))
 311                   string_overflow ();
 312                 width += w;
 313               }
 314           }
 315     }
 316   return width;
 317 }
 318
 319
 320 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 321        doc: /* Return width of CHAR when displayed in the current buffer.
 322 The width is measured by how many columns it occupies on the screen.
 323 Tab is taken to occupy `tab-width' columns.
 324 usage: (char-width CHAR)  */)
 325   (Lisp_Object ch)
 326 {
 327   int c;
 328   ptrdiff_t width;
 329
 330   CHECK_CHARACTER (ch);
 331   c = XINT (ch);
 332   width = char_width (c, buffer_display_table ());
 333   return make_number (width);
 334 }
 335
 336 /* Return width of string STR of length LEN when displayed in the
 337    current buffer.  The width is measured by how many columns it
 338    occupies on the screen.  If PRECISION > 0, return the width of
 339    longest substring that doesn't exceed PRECISION, and set number of
 340    characters and bytes of the substring in *NCHARS and *NBYTES
 341    respectively.  */
 342
 343 ptrdiff_t
 344 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 345                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 346 {
 347   ptrdiff_t i = 0, i_byte = 0;
 348   ptrdiff_t width = 0;
 349   struct Lisp_Char_Table *dp = buffer_display_table ();
 350
 351   while (i_byte < len)
 352     {
 353       int bytes;
 354       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 355       ptrdiff_t thiswidth = char_width (c, dp);
 356
 357       if (precision <= 0)
 358         {
 359           if (INT_ADD_OVERFLOW (width, thiswidth))
 360             string_overflow ();
 361         }
 362       else if (precision - width < thiswidth)
 363         {
 364           *nchars = i;
 365           *nbytes = i_byte;
 366           return width;
 367         }
 368       i++;
 369       i_byte += bytes;
 370       width += thiswidth;
 371   }
 372
 373   if (precision > 0)
 374     {
 375       *nchars = i;
 376       *nbytes = i_byte;
 377     }
 378
 379   return width;
 380 }
 381
 382 /* Return width of string STR of length LEN when displayed in the
 383    current buffer.  The width is measured by how many columns it
 384    occupies on the screen.  */
 385
 386 ptrdiff_t
 387 strwidth (const char *str, ptrdiff_t len)
 388 {
 389   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 390 }
 391
 392 /* Return width of Lisp string STRING when displayed in the current
 393    buffer.  The width is measured by how many columns it occupies on
 394    the screen while paying attention to compositions.  If PRECISION >
 395    0, return the width of longest substring that doesn't exceed
 396    PRECISION, and set number of characters and bytes of the substring
 397    in *NCHARS and *NBYTES respectively.  */
 398
 399 ptrdiff_t
 400 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 401                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 402 {
 403   ptrdiff_t len = SCHARS (string);
 404   /* This set multibyte to 0 even if STRING is multibyte when it
 405      contains only ascii and eight-bit-graphic, but that's
 406      intentional.  */
 407   bool multibyte = len < SBYTES (string);
 408   unsigned char *str = SDATA (string);
 409   ptrdiff_t i = 0, i_byte = 0;
 410   ptrdiff_t width = 0;
 411   struct Lisp_Char_Table *dp = buffer_display_table ();
 412
 413   while (i < len)
 414     {
 415       ptrdiff_t chars, bytes, thiswidth;
 416       Lisp_Object val;
 417       ptrdiff_t cmp_id;
 418       ptrdiff_t ignore, end;
 419
 420       if (find_composition (i, -1, &ignore, &end, &val, string)
 421           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 422               >= 0))
 423         {
 424           thiswidth = composition_table[cmp_id]->width;
 425           chars = end - i;
 426           bytes = string_char_to_byte (string, end) - i_byte;
 427         }
 428       else
 429         {
 430           int c;
 431
 432           if (multibyte)
 433             {
 434               int cbytes;
 435               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 436               bytes = cbytes;
 437             }
 438           else
 439             c = str[i_byte], bytes = 1;
 440           chars = 1;
 441           thiswidth = char_width (c, dp);
 442         }
 443
 444       if (precision <= 0)
 445         {
 446 #ifdef emacs
 447           if (INT_ADD_OVERFLOW (width, thiswidth))
 448             string_overflow ();
 449 #endif
 450         }
 451       else if (precision - width < thiswidth)
 452         {
 453           *nchars = i;
 454           *nbytes = i_byte;
 455           return width;
 456         }
 457       i += chars;
 458       i_byte += bytes;
 459       width += thiswidth;
 460     }
 461
 462   if (precision > 0)
 463     {
 464       *nchars = i;
 465       *nbytes = i_byte;
 466     }
 467
 468   return width;
 469 }
 470
 471 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 472        doc: /* Return width of STRING when displayed in the current buffer.
 473 Width is measured by how many columns it occupies on the screen.
 474 When calculating width of a multibyte character in STRING,
 475 only the base leading-code is considered; the validity of
 476 the following bytes is not checked.  Tabs in STRING are always
 477 taken to occupy `tab-width' columns.
 478 usage: (string-width STRING)  */)
 479   (Lisp_Object str)
 480 {
 481   Lisp_Object val;
 482
 483   CHECK_STRING (str);
 484   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 485   return val;
 486 }
 487
 488 /* Return the number of characters in the NBYTES bytes at PTR.
 489    This works by looking at the contents and checking for multibyte
 490    sequences while assuming that there's no invalid sequence.
 491    However, if the current buffer has enable-multibyte-characters =
 492    nil, we treat each byte as a character.  */
 493
 494 ptrdiff_t
 495 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 496 {
 497   /* current_buffer is null at early stages of Emacs initialization.  */
 498   if (current_buffer == 0
 499       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 500     return nbytes;
 501
 502   return multibyte_chars_in_text (ptr, nbytes);
 503 }
 504
 505 /* Return the number of characters in the NBYTES bytes at PTR.
 506    This works by looking at the contents and checking for multibyte
 507    sequences while assuming that there's no invalid sequence.  It
 508    ignores enable-multibyte-characters.  */
 509
 510 ptrdiff_t
 511 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 512 {
 513   const unsigned char *endp = ptr + nbytes;
 514   ptrdiff_t chars = 0;
 515
 516   while (ptr < endp)
 517     {
 518       int len = MULTIBYTE_LENGTH (ptr, endp);
 519
 520       if (len == 0)
 521         emacs_abort ();
 522       ptr += len;
 523       chars++;
 524     }
 525
 526   return chars;
 527 }
 528
 529 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 530    characters and bytes in it, and store them in *NCHARS and *NBYTES
 531    respectively.  On counting bytes, pay attention to that 8-bit
 532    characters not constructing a valid multibyte sequence are
 533    represented by 2-byte in a multibyte text.  */
 534
 535 void
 536 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 537                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 538 {
 539   const unsigned char *endp = str + len;
 540   int n;
 541   ptrdiff_t chars = 0, bytes = 0;
 542
 543   if (len >= MAX_MULTIBYTE_LENGTH)
 544     {
 545       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 546       while (str < adjusted_endp)
 547         {
 548           if (! CHAR_BYTE8_HEAD_P (*str)
 549               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 550             str += n, bytes += n;
 551           else
 552             str++, bytes += 2;
 553           chars++;
 554         }
 555     }
 556   while (str < endp)
 557     {
 558       if (! CHAR_BYTE8_HEAD_P (*str)
 559           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 560         str += n, bytes += n;
 561       else
 562         str++, bytes += 2;
 563       chars++;
 564     }
 565
 566   *nchars = chars;
 567   *nbytes = bytes;
 568   return;
 569 }
 570
 571 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 572    It actually converts only such 8-bit characters that don't construct
 573    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 574    NCHARS is nonzero, set *NCHARS to the number of characters in the
 575    text.  It is assured that we can use LEN bytes at STR as a work
 576    area and that is enough.  Return the number of bytes of the
 577    resulting text.  */
 578
 579 ptrdiff_t
 580 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 581                   ptrdiff_t *nchars)
 582 {
 583   unsigned char *p = str, *endp = str + nbytes;
 584   unsigned char *to;
 585   ptrdiff_t chars = 0;
 586   int n;
 587
 588   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 589     {
 590       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 591       while (p < adjusted_endp
 592              && ! CHAR_BYTE8_HEAD_P (*p)
 593              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 594         p += n, chars++;
 595     }
 596   while (p < endp
 597          && ! CHAR_BYTE8_HEAD_P (*p)
 598          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 599     p += n, chars++;
 600   if (nchars)
 601     *nchars = chars;
 602   if (p == endp)
 603     return nbytes;
 604
 605   to = p;
 606   nbytes = endp - p;
 607   endp = str + len;
 608   memmove (endp - nbytes, p, nbytes);
 609   p = endp - nbytes;
 610
 611   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 612     {
 613       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 614       while (p < adjusted_endp)
 615         {
 616           if (! CHAR_BYTE8_HEAD_P (*p)
 617               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 618             {
 619               while (n--)
 620                 *to++ = *p++;
 621             }
 622           else
 623             {
 624               int c = *p++;
 625               c = BYTE8_TO_CHAR (c);
 626               to += CHAR_STRING (c, to);
 627             }
 628         }
 629       chars++;
 630     }
 631   while (p < endp)
 632     {
 633       if (! CHAR_BYTE8_HEAD_P (*p)
 634           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 635         {
 636           while (n--)
 637             *to++ = *p++;
 638         }
 639       else
 640         {
 641           int c = *p++;
 642           c = BYTE8_TO_CHAR (c);
 643           to += CHAR_STRING (c, to);
 644         }
 645       chars++;
 646     }
 647   if (nchars)
 648     *nchars = chars;
 649   return (to - str);
 650 }
 651
 652 /* Parse unibyte string at STR of LEN bytes, and return the number of
 653    bytes it may occupy when converted to multibyte string by
 654    `str_to_multibyte'.  */
 655
 656 ptrdiff_t
 657 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 658 {
 659   const unsigned char *endp = str + len;
 660   ptrdiff_t bytes;
 661
 662   for (bytes = 0; str < endp; str++)
 663     {
 664       int n = *str < 0x80 ? 1 : 2;
 665       if (INT_ADD_OVERFLOW (bytes, n))
 666         string_overflow ();
 667       bytes += n;
 668     }
 669   return bytes;
 670 }
 671
 672
 673 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 674    that contains the same single-byte characters.  It actually
 675    converts all 8-bit characters to multibyte forms.  It is assured
 676    that we can use LEN bytes at STR as a work area and that is
 677    enough.  */
 678
 679 ptrdiff_t
 680 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 681 {
 682   unsigned char *p = str, *endp = str + bytes;
 683   unsigned char *to;
 684
 685   while (p < endp && *p < 0x80) p++;
 686   if (p == endp)
 687     return bytes;
 688   to = p;
 689   bytes = endp - p;
 690   endp = str + len;
 691   memmove (endp - bytes, p, bytes);
 692   p = endp - bytes;
 693   while (p < endp)
 694     {
 695       int c = *p++;
 696
 697       if (c >= 0x80)
 698         c = BYTE8_TO_CHAR (c);
 699       to += CHAR_STRING (c, to);
 700     }
 701   return (to - str);
 702 }
 703
 704 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 705    actually converts characters in the range 0x80..0xFF to
 706    unibyte.  */
 707
 708 ptrdiff_t
 709 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 710 {
 711   const unsigned char *p = str, *endp = str + bytes;
 712   unsigned char *to;
 713   int c, len;
 714
 715   while (p < endp)
 716     {
 717       c = *p;
 718       len = BYTES_BY_CHAR_HEAD (c);
 719       if (CHAR_BYTE8_HEAD_P (c))
 720         break;
 721       p += len;
 722     }
 723   to = str + (p - str);
 724   while (p < endp)
 725     {
 726       c = *p;
 727       len = BYTES_BY_CHAR_HEAD (c);
 728       if (CHAR_BYTE8_HEAD_P (c))
 729         {
 730           c = STRING_CHAR_ADVANCE (p);
 731           *to++ = CHAR_TO_BYTE8 (c);
 732         }
 733       else
 734         {
 735           while (len--) *to++ = *p++;
 736         }
 737     }
 738   return (to - str);
 739 }
 740
 741 /* Convert eight-bit chars in SRC (in multibyte form) to the
 742    corresponding byte and store in DST.  CHARS is the number of
 743    characters in SRC.  The value is the number of bytes stored in DST.
 744    Usually, the value is the same as CHARS, but is less than it if SRC
 745    contains a non-ASCII, non-eight-bit character.  */
 746
 747 ptrdiff_t
 748 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
 749 {
 750   ptrdiff_t i;
 751
 752   for (i = 0; i < chars; i++)
 753     {
 754       int c = STRING_CHAR_ADVANCE (src);
 755
 756       if (CHAR_BYTE8_P (c))
 757         c = CHAR_TO_BYTE8 (c);
 758       else if (! ASCII_CHAR_P (c))
 759         return i;
 760       *dst++ = c;
 761     }
 762   return i;
 763 }
 764
 765
 766 static ptrdiff_t
 767 string_count_byte8 (Lisp_Object string)
 768 {
 769   bool multibyte = STRING_MULTIBYTE (string);
 770   ptrdiff_t nbytes = SBYTES (string);
 771   unsigned char *p = SDATA (string);
 772   unsigned char *pend = p + nbytes;
 773   ptrdiff_t count = 0;
 774   int c, len;
 775
 776   if (multibyte)
 777     while (p < pend)
 778       {
 779         c = *p;
 780         len = BYTES_BY_CHAR_HEAD (c);
 781
 782         if (CHAR_BYTE8_HEAD_P (c))
 783           count++;
 784         p += len;
 785       }
 786   else
 787     while (p < pend)
 788       {
 789         if (*p++ >= 0x80)
 790           count++;
 791       }
 792   return count;
 793 }
 794
 795
 796 Lisp_Object
 797 string_escape_byte8 (Lisp_Object string)
 798 {
 799   ptrdiff_t nchars = SCHARS (string);
 800   ptrdiff_t nbytes = SBYTES (string);
 801   bool multibyte = STRING_MULTIBYTE (string);
 802   ptrdiff_t byte8_count;
 803   const unsigned char *src, *src_end;
 804   unsigned char *dst;
 805   Lisp_Object val;
 806   int c, len;
 807
 808   if (multibyte && nchars == nbytes)
 809     return string;
 810
 811   byte8_count = string_count_byte8 (string);
 812
 813   if (byte8_count == 0)
 814     return string;
 815
 816   if (multibyte)
 817     {
 818       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 819           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 820         string_overflow ();
 821
 822       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 823       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 824                                           nbytes + byte8_count * 2);
 825     }
 826   else
 827     {
 828       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 829         string_overflow ();
 830
 831       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 832       val = make_uninit_string (nbytes + byte8_count * 3);
 833     }
 834
 835   src = SDATA (string);
 836   src_end = src + nbytes;
 837   dst = SDATA (val);
 838   if (multibyte)
 839     while (src < src_end)
 840       {
 841         c = *src;
 842         len = BYTES_BY_CHAR_HEAD (c);
 843
 844         if (CHAR_BYTE8_HEAD_P (c))
 845           {
 846             c = STRING_CHAR_ADVANCE (src);
 847             c = CHAR_TO_BYTE8 (c);
 848             dst += sprintf ((char *) dst, "\\%03o", c);
 849           }
 850         else
 851           while (len--) *dst++ = *src++;
 852       }
 853   else
 854     while (src < src_end)
 855       {
 856         c = *src++;
 857         if (c >= 0x80)
 858           dst += sprintf ((char *) dst, "\\%03o", c);
 859         else
 860           *dst++ = c;
 861       }
 862   return val;
 863 }
 864
 865 \f
 866 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 867        doc: /*
 868 Concatenate all the argument characters and make the result a string.
 869 usage: (string &rest CHARACTERS)  */)
 870   (ptrdiff_t n, Lisp_Object *args)
 871 {
 872   ptrdiff_t i;
 873   int c;
 874   unsigned char *buf, *p;
 875   Lisp_Object str;
 876   USE_SAFE_ALLOCA;
 877
 878   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 879   p = buf;
 880
 881   for (i = 0; i < n; i++)
 882     {
 883       CHECK_CHARACTER (args[i]);
 884       c = XINT (args[i]);
 885       p += CHAR_STRING (c, p);
 886     }
 887
 888   str = make_string_from_bytes ((char *) buf, n, p - buf);
 889   SAFE_FREE ();
 890   return str;
 891 }
 892
 893 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 894        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 895 usage: (unibyte-string &rest BYTES)  */)
 896   (ptrdiff_t n, Lisp_Object *args)
 897 {
 898   ptrdiff_t i;
 899   Lisp_Object str;
 900   USE_SAFE_ALLOCA;
 901   unsigned char *buf = SAFE_ALLOCA (n);
 902   unsigned char *p = buf;
 903
 904   for (i = 0; i < n; i++)
 905     {
 906       CHECK_RANGED_INTEGER (args[i], 0, 255);
 907       *p++ = XINT (args[i]);
 908     }
 909
 910   str = make_string_from_bytes ((char *) buf, n, p - buf);
 911   SAFE_FREE ();
 912   return str;
 913 }
 914
 915 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 916        Schar_resolve_modifiers, 1, 1, 0,
 917        doc: /* Resolve modifiers in the character CHAR.
 918 The value is a character with modifiers resolved into the character
 919 code.  Unresolved modifiers are kept in the value.
 920 usage: (char-resolve-modifiers CHAR)  */)
 921   (Lisp_Object character)
 922 {
 923   EMACS_INT c;
 924
 925   CHECK_NUMBER (character);
 926   c = XINT (character);
 927   return make_number (char_resolve_modifier_mask (c));
 928 }
 929
 930 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 931        doc: /* Return a byte value of a character at point.
 932 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 933 a byte value.
 934 Optional 2nd arg STRING, if non-nil, is a string of which first
 935 character is a target to get a byte value.  In this case, POSITION, if
 936 non-nil, is an index of a target character in the string.
 937
 938 If the current buffer (or STRING) is multibyte, and the target
 939 character is not ASCII nor 8-bit character, an error is signaled.  */)
 940   (Lisp_Object position, Lisp_Object string)
 941 {
 942   int c;
 943   ptrdiff_t pos;
 944   unsigned char *p;
 945
 946   if (NILP (string))
 947     {
 948       if (NILP (position))
 949         {
 950           p = PT_ADDR;
 951         }
 952       else
 953         {
 954           CHECK_NUMBER_COERCE_MARKER (position);
 955           if (XINT (position) < BEGV || XINT (position) >= ZV)
 956             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 957           pos = XFASTINT (position);
 958           p = CHAR_POS_ADDR (pos);
 959         }
 960       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 961         return make_number (*p);
 962     }
 963   else
 964     {
 965       CHECK_STRING (string);
 966       if (NILP (position))
 967         {
 968           p = SDATA (string);
 969         }
 970       else
 971         {
 972           CHECK_NATNUM (position);
 973           if (XINT (position) >= SCHARS (string))
 974             args_out_of_range (string, position);
 975           pos = XFASTINT (position);
 976           p = SDATA (string) + string_char_to_byte (string, pos);
 977         }
 978       if (! STRING_MULTIBYTE (string))
 979         return make_number (*p);
 980     }
 981   c = STRING_CHAR (p);
 982   if (CHAR_BYTE8_P (c))
 983     c = CHAR_TO_BYTE8 (c);
 984   else if (! ASCII_CHAR_P (c))
 985     error ("Not an ASCII nor an 8-bit character: %d", c);
 986   return make_number (c);
 987 }
 988
 989 #ifdef emacs
 990
 991 void
 992 syms_of_character (void)
 993 {
 994   DEFSYM (Qcharacterp, "characterp");
 995   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
 996
 997   staticpro (&Vchar_unify_table);
 998   Vchar_unify_table = Qnil;
 999
1000   defsubr (&Smax_char);
1001   defsubr (&Scharacterp);
1002   defsubr (&Sunibyte_char_to_multibyte);
1003   defsubr (&Smultibyte_char_to_unibyte);
1004   defsubr (&Schar_width);
1005   defsubr (&Sstring_width);
1006   defsubr (&Sstring);
1007   defsubr (&Sunibyte_string);
1008   defsubr (&Schar_resolve_modifiers);
1009   defsubr (&Sget_byte);
1010
1011   DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
1012                doc: /*
1013 Vector recording all translation tables ever defined.
1014 Each element is a pair (SYMBOL . TABLE) relating the table to the
1015 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1016   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1017
1018   DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
1019                doc: /*
1020 A char-table for characters which invoke auto-filling.
1021 Such characters have value t in this table.  */);
1022   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1023   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1024   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1025
1026   DEFVAR_LISP ("char-width-table", Vchar_width_table,
1027                doc: /*
1028 A char-table for width (columns) of each character.  */);
1029   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1030   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1031   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1032                         make_number (4));
1033
1034   DEFVAR_LISP ("printable-chars", Vprintable_chars,
1035                doc: /* A char-table for each printable character.  */);
1036   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1037   Fset_char_table_range (Vprintable_chars,
1038                          Fcons (make_number (32), make_number (126)), Qt);
1039   Fset_char_table_range (Vprintable_chars,
1040                          Fcons (make_number (160),
1041                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1042
1043   DEFVAR_LISP ("char-script-table", Vchar_script_table,
1044                doc: /* Char table of script symbols.
1045 It has one extra slot whose value is a list of script symbols.  */);
1046
1047   DEFSYM (Qchar_script_table, "char-script-table");
1048   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1049   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1050
1051   DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
1052                doc: /* Alist of scripts vs the representative characters.
1053 Each element is a cons (SCRIPT . CHARS).
1054 SCRIPT is a symbol representing a script or a subgroup of a script.
1055 CHARS is a list or a vector of characters.
1056 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1057 If it is a vector, one of the characters in the vector is necessary.
1058 This variable is used to find a font for a specific script.  */);
1059   Vscript_representative_chars = Qnil;
1060
1061   DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
1062                doc: /* Char table of Unicode's "General Category".
1063 All Unicode characters have one of the following values (symbol):
1064   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1065   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1066 See The Unicode Standard for the meaning of those values.  */);
1067   /* The correct char-table is setup in characters.el.  */
1068   Vunicode_category_table = Qnil;
1069 }
1070
1071 #endif /* emacs */