src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2012  Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #define CHARACTER_INLINE EXTERN_INLINE
  33
  34 #include <stdio.h>
  35
  36 #ifdef emacs
  37
  38 #include <sys/types.h>
  39 #include <setjmp.h>
  40 #include <intprops.h>
  41 #include "lisp.h"
  42 #include "character.h"
  43 #include "buffer.h"
  44 #include "charset.h"
  45 #include "composite.h"
  46 #include "disptab.h"
  47
  48 #else  /* not emacs */
  49
  50 #include "mulelib.h"
  51
  52 #endif /* emacs */
  53
  54 Lisp_Object Qcharacterp;  /* Symbol `characterp'; set up in syms_of_character.  */
  55
  56 static Lisp_Object Qauto_fill_chars;  /* Symbol `auto-fill-chars'; set up in syms_of_character.  */
  57
  58 /* Char-table of information about which character to unify to which
  59    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  60 Lisp_Object Vchar_unify_table;
  61
  62 static Lisp_Object Qchar_script_table;  /* Symbol `char-script-table'; set up in syms_of_character.  */
  63
  64 \f
  65
  66 /* If character code C has modifier masks, reflect them to the
  67    character code if possible.  Return the resulting code.  */
  68
  69 EMACS_INT
  70 char_resolve_modifier_mask (EMACS_INT c)
  71 {
  72   /* A non-ASCII character can't reflect modifier bits to the code.  */
  73   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  74     return c;
  75
  76   /* For Meta, Shift, and Control modifiers, we need special care.  */
  77   if (c & CHAR_SHIFT)
  78     {
  79       /* Shift modifier is valid only with [A-Za-z].  */
  80       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  81         c &= ~CHAR_SHIFT;
  82       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  83         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  84       /* Shift modifier for control characters and SPC is ignored.  */
  85       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  86         c &= ~CHAR_SHIFT;
  87     }
  88   if (c & CHAR_CTL)
  89     {
  90       /* Simulate the code in lread.c.  */
  91       /* Allow `\C- ' and `\C-?'.  */
  92       if ((c & 0377) == ' ')
  93         c &= ~0177 & ~ CHAR_CTL;
  94       else if ((c & 0377) == '?')
  95         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  96       /* ASCII control chars are made from letters (both cases),
  97          as well as the non-letters within 0100...0137.  */
  98       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  99         c &= (037 | (~0177 & ~CHAR_CTL));
 100       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 101         c &= (037 | (~0177 & ~CHAR_CTL));
 102     }
 103 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 104   if (c & CHAR_META)
 105     {
 106       /* Move the meta bit to the right place for a string.  */
 107       c = (c & ~CHAR_META) | 0x80;
 108     }
 109 #endif
 110
 111   return c;
 112 }
 113
 114
 115 /* Store multibyte form of character C at P.  If C has modifier bits,
 116    handle them appropriately.  */
 117
 118 int
 119 char_string (unsigned int c, unsigned char *p)
 120 {
 121   int bytes;
 122
 123   if (c & CHAR_MODIFIER_MASK)
 124     {
 125       c = char_resolve_modifier_mask (c);
 126       /* If C still has any modifier bits, just ignore it.  */
 127       c &= ~CHAR_MODIFIER_MASK;
 128     }
 129
 130   MAYBE_UNIFY_CHAR (c);
 131
 132   if (c <= MAX_3_BYTE_CHAR)
 133     {
 134       bytes = CHAR_STRING (c, p);
 135     }
 136   else if (c <= MAX_4_BYTE_CHAR)
 137     {
 138       p[0] = (0xF0 | (c >> 18));
 139       p[1] = (0x80 | ((c >> 12) & 0x3F));
 140       p[2] = (0x80 | ((c >> 6) & 0x3F));
 141       p[3] = (0x80 | (c & 0x3F));
 142       bytes = 4;
 143     }
 144   else if (c <= MAX_5_BYTE_CHAR)
 145     {
 146       p[0] = 0xF8;
 147       p[1] = (0x80 | ((c >> 18) & 0x0F));
 148       p[2] = (0x80 | ((c >> 12) & 0x3F));
 149       p[3] = (0x80 | ((c >> 6) & 0x3F));
 150       p[4] = (0x80 | (c & 0x3F));
 151       bytes = 5;
 152     }
 153   else if (c <= MAX_CHAR)
 154     {
 155       c = CHAR_TO_BYTE8 (c);
 156       bytes = BYTE8_STRING (c, p);
 157     }
 158   else
 159     error ("Invalid character: %x", c);
 160
 161   return bytes;
 162 }
 163
 164
 165 /* Return a character whose multibyte form is at P.  If LEN is not
 166    NULL, it must be a pointer to integer.  In that case, set *LEN to
 167    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 168    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 169    the ending address (i.e., the starting address of the next
 170    character) of the multibyte form.  */
 171
 172 int
 173 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 174 {
 175   int c;
 176   const unsigned char *saved_p = p;
 177
 178   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 179     {
 180       c = STRING_CHAR_ADVANCE (p);
 181     }
 182   else if (! (*p & 0x08))
 183     {
 184       c = ((((p)[0] & 0xF) << 18)
 185            | (((p)[1] & 0x3F) << 12)
 186            | (((p)[2] & 0x3F) << 6)
 187            | ((p)[3] & 0x3F));
 188       p += 4;
 189     }
 190   else
 191     {
 192       c = ((((p)[1] & 0x3F) << 18)
 193            | (((p)[2] & 0x3F) << 12)
 194            | (((p)[3] & 0x3F) << 6)
 195            | ((p)[4] & 0x3F));
 196       p += 5;
 197     }
 198
 199   MAYBE_UNIFY_CHAR (c);
 200
 201   if (len)
 202     *len = p - saved_p;
 203   if (advanced)
 204     *advanced = p;
 205   return c;
 206 }
 207
 208
 209 /* Translate character C by translation table TABLE.  If no translation is
 210    found in TABLE, return the untranslated character.  If TABLE is a list,
 211    elements are char tables.  In that case, recursively translate C by all the
 212    tables in the list.  */
 213
 214 int
 215 translate_char (Lisp_Object table, int c)
 216 {
 217   if (CHAR_TABLE_P (table))
 218     {
 219       Lisp_Object ch;
 220
 221       ch = CHAR_TABLE_REF (table, c);
 222       if (CHARACTERP (ch))
 223         c = XINT (ch);
 224     }
 225   else
 226     {
 227       for (; CONSP (table); table = XCDR (table))
 228         c = translate_char (XCAR (table), c);
 229     }
 230   return c;
 231 }
 232
 233 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 234    them, return (C & 0xFF).  */
 235
 236 int
 237 multibyte_char_to_unibyte (int c)
 238 {
 239   if (c < 0x80)
 240     return c;
 241   if (CHAR_BYTE8_P (c))
 242     return CHAR_TO_BYTE8 (c);
 243   return (c & 0xFF);
 244 }
 245
 246 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 247    by charset_unibyte.  */
 248
 249 int
 250 multibyte_char_to_unibyte_safe (int c)
 251 {
 252   if (c < 0x80)
 253     return c;
 254   if (CHAR_BYTE8_P (c))
 255     return CHAR_TO_BYTE8 (c);
 256   return -1;
 257 }
 258
 259 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 260        doc: /* Return non-nil if OBJECT is a character.
 261 usage: (characterp OBJECT)  */)
 262   (Lisp_Object object, Lisp_Object ignore)
 263 {
 264   return (CHARACTERP (object) ? Qt : Qnil);
 265 }
 266
 267 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 268        doc: /* Return the character of the maximum code.  */)
 269   (void)
 270 {
 271   return make_number (MAX_CHAR);
 272 }
 273
 274 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 275        Sunibyte_char_to_multibyte, 1, 1, 0,
 276        doc: /* Convert the byte CH to multibyte character.  */)
 277   (Lisp_Object ch)
 278 {
 279   int c;
 280
 281   CHECK_CHARACTER (ch);
 282   c = XFASTINT (ch);
 283   if (c >= 0x100)
 284     error ("Not a unibyte character: %d", c);
 285   MAKE_CHAR_MULTIBYTE (c);
 286   return make_number (c);
 287 }
 288
 289 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 290        Smultibyte_char_to_unibyte, 1, 1, 0,
 291        doc: /* Convert the multibyte character CH to a byte.
 292 If the multibyte character does not represent a byte, return -1.  */)
 293   (Lisp_Object ch)
 294 {
 295   int cm;
 296
 297   CHECK_CHARACTER (ch);
 298   cm = XFASTINT (ch);
 299   if (cm < 256)
 300     /* Can't distinguish a byte read from a unibyte buffer from
 301        a latin1 char, so let's let it slide.  */
 302     return ch;
 303   else
 304     {
 305       int cu = CHAR_TO_BYTE_SAFE (cm);
 306       return make_number (cu);
 307     }
 308 }
 309
 310
 311 /* Return width (columns) of C considering the buffer display table DP. */
 312
 313 static ptrdiff_t
 314 char_width (int c, struct Lisp_Char_Table *dp)
 315 {
 316   ptrdiff_t width = CHAR_WIDTH (c);
 317
 318   if (dp)
 319     {
 320       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 321       int i;
 322
 323       if (VECTORP (disp))
 324         for (i = 0, width = 0; i < ASIZE (disp); i++)
 325           {
 326             ch = AREF (disp, i);
 327             if (CHARACTERP (ch))
 328               {
 329                 int w = CHAR_WIDTH (XFASTINT (ch));
 330                 if (INT_ADD_OVERFLOW (width, w))
 331                   string_overflow ();
 332                 width += w;
 333               }
 334           }
 335     }
 336   return width;
 337 }
 338
 339
 340 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 341        doc: /* Return width of CHAR when displayed in the current buffer.
 342 The width is measured by how many columns it occupies on the screen.
 343 Tab is taken to occupy `tab-width' columns.
 344 usage: (char-width CHAR)  */)
 345   (Lisp_Object ch)
 346 {
 347   int c;
 348   ptrdiff_t width;
 349
 350   CHECK_CHARACTER (ch);
 351   c = XINT (ch);
 352   width = char_width (c, buffer_display_table ());
 353   return make_number (width);
 354 }
 355
 356 /* Return width of string STR of length LEN when displayed in the
 357    current buffer.  The width is measured by how many columns it
 358    occupies on the screen.  If PRECISION > 0, return the width of
 359    longest substring that doesn't exceed PRECISION, and set number of
 360    characters and bytes of the substring in *NCHARS and *NBYTES
 361    respectively.  */
 362
 363 ptrdiff_t
 364 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 365                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 366 {
 367   ptrdiff_t i = 0, i_byte = 0;
 368   ptrdiff_t width = 0;
 369   struct Lisp_Char_Table *dp = buffer_display_table ();
 370
 371   while (i_byte < len)
 372     {
 373       int bytes;
 374       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 375       ptrdiff_t thiswidth = char_width (c, dp);
 376
 377       if (precision <= 0)
 378         {
 379           if (INT_ADD_OVERFLOW (width, thiswidth))
 380             string_overflow ();
 381         }
 382       else if (precision - width < thiswidth)
 383         {
 384           *nchars = i;
 385           *nbytes = i_byte;
 386           return width;
 387         }
 388       i++;
 389       i_byte += bytes;
 390       width += thiswidth;
 391   }
 392
 393   if (precision > 0)
 394     {
 395       *nchars = i;
 396       *nbytes = i_byte;
 397     }
 398
 399   return width;
 400 }
 401
 402 /* Return width of string STR of length LEN when displayed in the
 403    current buffer.  The width is measured by how many columns it
 404    occupies on the screen.  */
 405
 406 ptrdiff_t
 407 strwidth (const char *str, ptrdiff_t len)
 408 {
 409   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 410 }
 411
 412 /* Return width of Lisp string STRING when displayed in the current
 413    buffer.  The width is measured by how many columns it occupies on
 414    the screen while paying attention to compositions.  If PRECISION >
 415    0, return the width of longest substring that doesn't exceed
 416    PRECISION, and set number of characters and bytes of the substring
 417    in *NCHARS and *NBYTES respectively.  */
 418
 419 ptrdiff_t
 420 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 421                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 422 {
 423   ptrdiff_t len = SCHARS (string);
 424   /* This set multibyte to 0 even if STRING is multibyte when it
 425      contains only ascii and eight-bit-graphic, but that's
 426      intentional.  */
 427   int multibyte = len < SBYTES (string);
 428   unsigned char *str = SDATA (string);
 429   ptrdiff_t i = 0, i_byte = 0;
 430   ptrdiff_t width = 0;
 431   struct Lisp_Char_Table *dp = buffer_display_table ();
 432
 433   while (i < len)
 434     {
 435       ptrdiff_t chars, bytes, thiswidth;
 436       Lisp_Object val;
 437       ptrdiff_t cmp_id;
 438       ptrdiff_t ignore, end;
 439
 440       if (find_composition (i, -1, &ignore, &end, &val, string)
 441           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 442               >= 0))
 443         {
 444           thiswidth = composition_table[cmp_id]->width;
 445           chars = end - i;
 446           bytes = string_char_to_byte (string, end) - i_byte;
 447         }
 448       else
 449         {
 450           int c;
 451
 452           if (multibyte)
 453             {
 454               int cbytes;
 455               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 456               bytes = cbytes;
 457             }
 458           else
 459             c = str[i_byte], bytes = 1;
 460           chars = 1;
 461           thiswidth = char_width (c, dp);
 462         }
 463
 464       if (precision <= 0)
 465         {
 466 #ifdef emacs
 467           if (INT_ADD_OVERFLOW (width, thiswidth))
 468             string_overflow ();
 469 #endif
 470         }
 471       else if (precision - width < thiswidth)
 472         {
 473           *nchars = i;
 474           *nbytes = i_byte;
 475           return width;
 476         }
 477       i += chars;
 478       i_byte += bytes;
 479       width += thiswidth;
 480     }
 481
 482   if (precision > 0)
 483     {
 484       *nchars = i;
 485       *nbytes = i_byte;
 486     }
 487
 488   return width;
 489 }
 490
 491 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 492        doc: /* Return width of STRING when displayed in the current buffer.
 493 Width is measured by how many columns it occupies on the screen.
 494 When calculating width of a multibyte character in STRING,
 495 only the base leading-code is considered; the validity of
 496 the following bytes is not checked.  Tabs in STRING are always
 497 taken to occupy `tab-width' columns.
 498 usage: (string-width STRING)  */)
 499   (Lisp_Object str)
 500 {
 501   Lisp_Object val;
 502
 503   CHECK_STRING (str);
 504   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 505   return val;
 506 }
 507
 508 /* Return the number of characters in the NBYTES bytes at PTR.
 509    This works by looking at the contents and checking for multibyte
 510    sequences while assuming that there's no invalid sequence.
 511    However, if the current buffer has enable-multibyte-characters =
 512    nil, we treat each byte as a character.  */
 513
 514 ptrdiff_t
 515 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 516 {
 517   /* current_buffer is null at early stages of Emacs initialization.  */
 518   if (current_buffer == 0
 519       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 520     return nbytes;
 521
 522   return multibyte_chars_in_text (ptr, nbytes);
 523 }
 524
 525 /* Return the number of characters in the NBYTES bytes at PTR.
 526    This works by looking at the contents and checking for multibyte
 527    sequences while assuming that there's no invalid sequence.  It
 528    ignores enable-multibyte-characters.  */
 529
 530 ptrdiff_t
 531 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 532 {
 533   const unsigned char *endp = ptr + nbytes;
 534   ptrdiff_t chars = 0;
 535
 536   while (ptr < endp)
 537     {
 538       int len = MULTIBYTE_LENGTH (ptr, endp);
 539
 540       if (len == 0)
 541         abort ();
 542       ptr += len;
 543       chars++;
 544     }
 545
 546   return chars;
 547 }
 548
 549 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 550    characters and bytes in it, and store them in *NCHARS and *NBYTES
 551    respectively.  On counting bytes, pay attention to that 8-bit
 552    characters not constructing a valid multibyte sequence are
 553    represented by 2-byte in a multibyte text.  */
 554
 555 void
 556 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 557                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 558 {
 559   const unsigned char *endp = str + len;
 560   int n;
 561   ptrdiff_t chars = 0, bytes = 0;
 562
 563   if (len >= MAX_MULTIBYTE_LENGTH)
 564     {
 565       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 566       while (str < adjusted_endp)
 567         {
 568           if (! CHAR_BYTE8_HEAD_P (*str)
 569               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 570             str += n, bytes += n;
 571           else
 572             str++, bytes += 2;
 573           chars++;
 574         }
 575     }
 576   while (str < endp)
 577     {
 578       if (! CHAR_BYTE8_HEAD_P (*str)
 579           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 580         str += n, bytes += n;
 581       else
 582         str++, bytes += 2;
 583       chars++;
 584     }
 585
 586   *nchars = chars;
 587   *nbytes = bytes;
 588   return;
 589 }
 590
 591 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 592    It actually converts only such 8-bit characters that don't construct
 593    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 594    NCHARS is nonzero, set *NCHARS to the number of characters in the
 595    text.  It is assured that we can use LEN bytes at STR as a work
 596    area and that is enough.  Return the number of bytes of the
 597    resulting text.  */
 598
 599 ptrdiff_t
 600 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 601                   ptrdiff_t *nchars)
 602 {
 603   unsigned char *p = str, *endp = str + nbytes;
 604   unsigned char *to;
 605   ptrdiff_t chars = 0;
 606   int n;
 607
 608   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 609     {
 610       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 611       while (p < adjusted_endp
 612              && ! CHAR_BYTE8_HEAD_P (*p)
 613              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 614         p += n, chars++;
 615     }
 616   while (p < endp
 617          && ! CHAR_BYTE8_HEAD_P (*p)
 618          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 619     p += n, chars++;
 620   if (nchars)
 621     *nchars = chars;
 622   if (p == endp)
 623     return nbytes;
 624
 625   to = p;
 626   nbytes = endp - p;
 627   endp = str + len;
 628   memmove (endp - nbytes, p, nbytes);
 629   p = endp - nbytes;
 630
 631   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 632     {
 633       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 634       while (p < adjusted_endp)
 635         {
 636           if (! CHAR_BYTE8_HEAD_P (*p)
 637               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 638             {
 639               while (n--)
 640                 *to++ = *p++;
 641             }
 642           else
 643             {
 644               int c = *p++;
 645               c = BYTE8_TO_CHAR (c);
 646               to += CHAR_STRING (c, to);
 647             }
 648         }
 649       chars++;
 650     }
 651   while (p < endp)
 652     {
 653       if (! CHAR_BYTE8_HEAD_P (*p)
 654           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 655         {
 656           while (n--)
 657             *to++ = *p++;
 658         }
 659       else
 660         {
 661           int c = *p++;
 662           c = BYTE8_TO_CHAR (c);
 663           to += CHAR_STRING (c, to);
 664         }
 665       chars++;
 666     }
 667   if (nchars)
 668     *nchars = chars;
 669   return (to - str);
 670 }
 671
 672 /* Parse unibyte string at STR of LEN bytes, and return the number of
 673    bytes it may occupy when converted to multibyte string by
 674    `str_to_multibyte'.  */
 675
 676 ptrdiff_t
 677 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 678 {
 679   const unsigned char *endp = str + len;
 680   ptrdiff_t bytes;
 681
 682   for (bytes = 0; str < endp; str++)
 683     {
 684       int n = *str < 0x80 ? 1 : 2;
 685       if (INT_ADD_OVERFLOW (bytes, n))
 686         string_overflow ();
 687       bytes += n;
 688     }
 689   return bytes;
 690 }
 691
 692
 693 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 694    that contains the same single-byte characters.  It actually
 695    converts all 8-bit characters to multibyte forms.  It is assured
 696    that we can use LEN bytes at STR as a work area and that is
 697    enough.  */
 698
 699 ptrdiff_t
 700 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 701 {
 702   unsigned char *p = str, *endp = str + bytes;
 703   unsigned char *to;
 704
 705   while (p < endp && *p < 0x80) p++;
 706   if (p == endp)
 707     return bytes;
 708   to = p;
 709   bytes = endp - p;
 710   endp = str + len;
 711   memmove (endp - bytes, p, bytes);
 712   p = endp - bytes;
 713   while (p < endp)
 714     {
 715       int c = *p++;
 716
 717       if (c >= 0x80)
 718         c = BYTE8_TO_CHAR (c);
 719       to += CHAR_STRING (c, to);
 720     }
 721   return (to - str);
 722 }
 723
 724 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 725    actually converts characters in the range 0x80..0xFF to
 726    unibyte.  */
 727
 728 ptrdiff_t
 729 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 730 {
 731   const unsigned char *p = str, *endp = str + bytes;
 732   unsigned char *to;
 733   int c, len;
 734
 735   while (p < endp)
 736     {
 737       c = *p;
 738       len = BYTES_BY_CHAR_HEAD (c);
 739       if (CHAR_BYTE8_HEAD_P (c))
 740         break;
 741       p += len;
 742     }
 743   to = str + (p - str);
 744   while (p < endp)
 745     {
 746       c = *p;
 747       len = BYTES_BY_CHAR_HEAD (c);
 748       if (CHAR_BYTE8_HEAD_P (c))
 749         {
 750           c = STRING_CHAR_ADVANCE (p);
 751           *to++ = CHAR_TO_BYTE8 (c);
 752         }
 753       else
 754         {
 755           while (len--) *to++ = *p++;
 756         }
 757     }
 758   return (to - str);
 759 }
 760
 761 /* Convert eight-bit chars in SRC (in multibyte form) to the
 762    corresponding byte and store in DST.  CHARS is the number of
 763    characters in SRC.  The value is the number of bytes stored in DST.
 764    Usually, the value is the same as CHARS, but is less than it if SRC
 765    contains a non-ASCII, non-eight-bit character.  If ACCEPT_LATIN_1
 766    is nonzero, a Latin-1 character is accepted and converted to a byte
 767    of that character code.
 768    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 769
 770 ptrdiff_t
 771 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars, int accept_latin_1)
 772 {
 773   ptrdiff_t i;
 774
 775   for (i = 0; i < chars; i++)
 776     {
 777       int c = STRING_CHAR_ADVANCE (src);
 778
 779       if (CHAR_BYTE8_P (c))
 780         c = CHAR_TO_BYTE8 (c);
 781       else if (! ASCII_CHAR_P (c)
 782                && (! accept_latin_1 || c >= 0x100))
 783         return i;
 784       *dst++ = c;
 785     }
 786   return i;
 787 }
 788
 789
 790 static ptrdiff_t
 791 string_count_byte8 (Lisp_Object string)
 792 {
 793   int multibyte = STRING_MULTIBYTE (string);
 794   ptrdiff_t nbytes = SBYTES (string);
 795   unsigned char *p = SDATA (string);
 796   unsigned char *pend = p + nbytes;
 797   ptrdiff_t count = 0;
 798   int c, len;
 799
 800   if (multibyte)
 801     while (p < pend)
 802       {
 803         c = *p;
 804         len = BYTES_BY_CHAR_HEAD (c);
 805
 806         if (CHAR_BYTE8_HEAD_P (c))
 807           count++;
 808         p += len;
 809       }
 810   else
 811     while (p < pend)
 812       {
 813         if (*p++ >= 0x80)
 814           count++;
 815       }
 816   return count;
 817 }
 818
 819
 820 Lisp_Object
 821 string_escape_byte8 (Lisp_Object string)
 822 {
 823   ptrdiff_t nchars = SCHARS (string);
 824   ptrdiff_t nbytes = SBYTES (string);
 825   int multibyte = STRING_MULTIBYTE (string);
 826   ptrdiff_t byte8_count;
 827   const unsigned char *src, *src_end;
 828   unsigned char *dst;
 829   Lisp_Object val;
 830   int c, len;
 831
 832   if (multibyte && nchars == nbytes)
 833     return string;
 834
 835   byte8_count = string_count_byte8 (string);
 836
 837   if (byte8_count == 0)
 838     return string;
 839
 840   if (multibyte)
 841     {
 842       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 843           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 844         string_overflow ();
 845
 846       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 847       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 848                                           nbytes + byte8_count * 2);
 849     }
 850   else
 851     {
 852       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 853         string_overflow ();
 854
 855       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 856       val = make_uninit_string (nbytes + byte8_count * 3);
 857     }
 858
 859   src = SDATA (string);
 860   src_end = src + nbytes;
 861   dst = SDATA (val);
 862   if (multibyte)
 863     while (src < src_end)
 864       {
 865         c = *src;
 866         len = BYTES_BY_CHAR_HEAD (c);
 867
 868         if (CHAR_BYTE8_HEAD_P (c))
 869           {
 870             c = STRING_CHAR_ADVANCE (src);
 871             c = CHAR_TO_BYTE8 (c);
 872             dst += sprintf ((char *) dst, "\\%03o", c);
 873           }
 874         else
 875           while (len--) *dst++ = *src++;
 876       }
 877   else
 878     while (src < src_end)
 879       {
 880         c = *src++;
 881         if (c >= 0x80)
 882           dst += sprintf ((char *) dst, "\\%03o", c);
 883         else
 884           *dst++ = c;
 885       }
 886   return val;
 887 }
 888
 889 \f
 890 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 891        doc: /*
 892 Concatenate all the argument characters and make the result a string.
 893 usage: (string &rest CHARACTERS)  */)
 894   (ptrdiff_t n, Lisp_Object *args)
 895 {
 896   ptrdiff_t i;
 897   int c;
 898   unsigned char *buf, *p;
 899   Lisp_Object str;
 900   USE_SAFE_ALLOCA;
 901
 902   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 903   p = buf;
 904
 905   for (i = 0; i < n; i++)
 906     {
 907       CHECK_CHARACTER (args[i]);
 908       c = XINT (args[i]);
 909       p += CHAR_STRING (c, p);
 910     }
 911
 912   str = make_string_from_bytes ((char *) buf, n, p - buf);
 913   SAFE_FREE ();
 914   return str;
 915 }
 916
 917 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 918        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 919 usage: (unibyte-string &rest BYTES)  */)
 920   (ptrdiff_t n, Lisp_Object *args)
 921 {
 922   ptrdiff_t i;
 923   Lisp_Object str;
 924   USE_SAFE_ALLOCA;
 925   unsigned char *buf = SAFE_ALLOCA (n);
 926   unsigned char *p = buf;
 927
 928   for (i = 0; i < n; i++)
 929     {
 930       CHECK_RANGED_INTEGER (args[i], 0, 255);
 931       *p++ = XINT (args[i]);
 932     }
 933
 934   str = make_string_from_bytes ((char *) buf, n, p - buf);
 935   SAFE_FREE ();
 936   return str;
 937 }
 938
 939 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 940        Schar_resolve_modifiers, 1, 1, 0,
 941        doc: /* Resolve modifiers in the character CHAR.
 942 The value is a character with modifiers resolved into the character
 943 code.  Unresolved modifiers are kept in the value.
 944 usage: (char-resolve-modifiers CHAR)  */)
 945   (Lisp_Object character)
 946 {
 947   EMACS_INT c;
 948
 949   CHECK_NUMBER (character);
 950   c = XINT (character);
 951   return make_number (char_resolve_modifier_mask (c));
 952 }
 953
 954 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 955        doc: /* Return a byte value of a character at point.
 956 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 957 a byte value.
 958 Optional 2nd arg STRING, if non-nil, is a string of which first
 959 character is a target to get a byte value.  In this case, POSITION, if
 960 non-nil, is an index of a target character in the string.
 961
 962 If the current buffer (or STRING) is multibyte, and the target
 963 character is not ASCII nor 8-bit character, an error is signaled.  */)
 964   (Lisp_Object position, Lisp_Object string)
 965 {
 966   int c;
 967   ptrdiff_t pos;
 968   unsigned char *p;
 969
 970   if (NILP (string))
 971     {
 972       if (NILP (position))
 973         {
 974           p = PT_ADDR;
 975         }
 976       else
 977         {
 978           CHECK_NUMBER_COERCE_MARKER (position);
 979           if (XINT (position) < BEGV || XINT (position) >= ZV)
 980             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 981           pos = XFASTINT (position);
 982           p = CHAR_POS_ADDR (pos);
 983         }
 984       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 985         return make_number (*p);
 986     }
 987   else
 988     {
 989       CHECK_STRING (string);
 990       if (NILP (position))
 991         {
 992           p = SDATA (string);
 993         }
 994       else
 995         {
 996           CHECK_NATNUM (position);
 997           if (XINT (position) >= SCHARS (string))
 998             args_out_of_range (string, position);
 999           pos = XFASTINT (position);
1000           p = SDATA (string) + string_char_to_byte (string, pos);
1001         }
1002       if (! STRING_MULTIBYTE (string))
1003         return make_number (*p);
1004     }
1005   c = STRING_CHAR (p);
1006   if (CHAR_BYTE8_P (c))
1007     c = CHAR_TO_BYTE8 (c);
1008   else if (! ASCII_CHAR_P (c))
1009     error ("Not an ASCII nor an 8-bit character: %d", c);
1010   return make_number (c);
1011 }
1012
1013 #ifdef emacs
1014
1015 void
1016 syms_of_character (void)
1017 {
1018   DEFSYM (Qcharacterp, "characterp");
1019   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1020
1021   staticpro (&Vchar_unify_table);
1022   Vchar_unify_table = Qnil;
1023
1024   defsubr (&Smax_char);
1025   defsubr (&Scharacterp);
1026   defsubr (&Sunibyte_char_to_multibyte);
1027   defsubr (&Smultibyte_char_to_unibyte);
1028   defsubr (&Schar_width);
1029   defsubr (&Sstring_width);
1030   defsubr (&Sstring);
1031   defsubr (&Sunibyte_string);
1032   defsubr (&Schar_resolve_modifiers);
1033   defsubr (&Sget_byte);
1034
1035   DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
1036                doc: /*
1037 Vector recording all translation tables ever defined.
1038 Each element is a pair (SYMBOL . TABLE) relating the table to the
1039 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1040   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1041
1042   DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
1043                doc: /*
1044 A char-table for characters which invoke auto-filling.
1045 Such characters have value t in this table.  */);
1046   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1047   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1048   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1049
1050   DEFVAR_LISP ("char-width-table", Vchar_width_table,
1051                doc: /*
1052 A char-table for width (columns) of each character.  */);
1053   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1054   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1055   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1056                         make_number (4));
1057
1058   DEFVAR_LISP ("printable-chars", Vprintable_chars,
1059                doc: /* A char-table for each printable character.  */);
1060   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1061   Fset_char_table_range (Vprintable_chars,
1062                          Fcons (make_number (32), make_number (126)), Qt);
1063   Fset_char_table_range (Vprintable_chars,
1064                          Fcons (make_number (160),
1065                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1066
1067   DEFVAR_LISP ("char-script-table", Vchar_script_table,
1068                doc: /* Char table of script symbols.
1069 It has one extra slot whose value is a list of script symbols.  */);
1070
1071   /* Intern this now in case it isn't already done.
1072      Setting this variable twice is harmless.
1073      But don't staticpro it here--that is done in alloc.c.  */
1074   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
1075   DEFSYM (Qchar_script_table, "char-script-table");
1076   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1077   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1078
1079   DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
1080                doc: /* Alist of scripts vs the representative characters.
1081 Each element is a cons (SCRIPT . CHARS).
1082 SCRIPT is a symbol representing a script or a subgroup of a script.
1083 CHARS is a list or a vector of characters.
1084 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1085 If it is a vector, one of the characters in the vector is necessary.
1086 This variable is used to find a font for a specific script.  */);
1087   Vscript_representative_chars = Qnil;
1088
1089   DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
1090                doc: /* Char table of Unicode's "General Category".
1091 All Unicode characters have one of the following values (symbol):
1092   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1093   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1094 See The Unicode Standard for the meaning of those values.  */);
1095   /* The correct char-table is setup in characters.el.  */
1096   Vunicode_category_table = Qnil;
1097 }
1098
1099 #endif /* emacs */