src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2012  Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
/* First, read the documentation in `character.h' to understand the
   code in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <setjmp.h>
  38 #include <intprops.h>
  39 #include "lisp.h"
  40 #include "character.h"
  41 #include "buffer.h"
  42 #include "charset.h"
  43 #include "composite.h"
  44 #include "disptab.h"
  45
  46 #else  /* not emacs */
  47
  48 #include "mulelib.h"
  49
  50 #endif /* emacs */
  51
  52 Lisp_Object Qcharacterp;
  53
  54 static Lisp_Object Qauto_fill_chars;
  55
  56 /* Char-table of information about which character to unify to which
  57    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  58 Lisp_Object Vchar_unify_table;
  59
  60 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  61 unsigned char *_fetch_multibyte_char_p;
  62
  63 static Lisp_Object Qchar_script_table;
  64
  65 \f
  66
  67 /* If character code C has modifier masks, reflect them to the
  68    character code if possible.  Return the resulting code.  */
  69
  70 int
  71 char_resolve_modifier_mask (int c)
  72 {
  73   /* A non-ASCII character can't reflect modifier bits to the code.  */
  74   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  75     return c;
  76
  77   /* For Meta, Shift, and Control modifiers, we need special care.  */
  78   if (c & CHAR_SHIFT)
  79     {
  80       /* Shift modifier is valid only with [A-Za-z].  */
  81       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  82         c &= ~CHAR_SHIFT;
  83       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  84         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  85       /* Shift modifier for control characters and SPC is ignored.  */
  86       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  87         c &= ~CHAR_SHIFT;
  88     }
  89   if (c & CHAR_CTL)
  90     {
  91       /* Simulate the code in lread.c.  */
  92       /* Allow `\C- ' and `\C-?'.  */
  93       if ((c & 0377) == ' ')
  94         c &= ~0177 & ~ CHAR_CTL;
  95       else if ((c & 0377) == '?')
  96         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  97       /* ASCII control chars are made from letters (both cases),
  98          as well as the non-letters within 0100...0137.  */
  99       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 100         c &= (037 | (~0177 & ~CHAR_CTL));
 101       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 102         c &= (037 | (~0177 & ~CHAR_CTL));
 103     }
 104 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 105   if (c & CHAR_META)
 106     {
 107       /* Move the meta bit to the right place for a string.  */
 108       c = (c & ~CHAR_META) | 0x80;
 109     }
 110 #endif
 111
 112   return c;
 113 }
 114
 115
 116 /* Store multibyte form of character C at P.  If C has modifier bits,
 117    handle them appropriately.  */
 118
 119 int
 120 char_string (unsigned int c, unsigned char *p)
 121 {
 122   int bytes;
 123
 124   if (c & CHAR_MODIFIER_MASK)
 125     {
 126       c = char_resolve_modifier_mask (c);
 127       /* If C still has any modifier bits, just ignore it.  */
 128       c &= ~CHAR_MODIFIER_MASK;
 129     }
 130
 131   MAYBE_UNIFY_CHAR (c);
 132
 133   if (c <= MAX_3_BYTE_CHAR)
 134     {
 135       bytes = CHAR_STRING (c, p);
 136     }
 137   else if (c <= MAX_4_BYTE_CHAR)
 138     {
 139       p[0] = (0xF0 | (c >> 18));
 140       p[1] = (0x80 | ((c >> 12) & 0x3F));
 141       p[2] = (0x80 | ((c >> 6) & 0x3F));
 142       p[3] = (0x80 | (c & 0x3F));
 143       bytes = 4;
 144     }
 145   else if (c <= MAX_5_BYTE_CHAR)
 146     {
 147       p[0] = 0xF8;
 148       p[1] = (0x80 | ((c >> 18) & 0x0F));
 149       p[2] = (0x80 | ((c >> 12) & 0x3F));
 150       p[3] = (0x80 | ((c >> 6) & 0x3F));
 151       p[4] = (0x80 | (c & 0x3F));
 152       bytes = 5;
 153     }
 154   else if (c <= MAX_CHAR)
 155     {
 156       c = CHAR_TO_BYTE8 (c);
 157       bytes = BYTE8_STRING (c, p);
 158     }
 159   else
 160     error ("Invalid character: %x", c);
 161
 162   return bytes;
 163 }
 164
 165
 166 /* Return a character whose multibyte form is at P.  If LEN is not
 167    NULL, it must be a pointer to integer.  In that case, set *LEN to
 168    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 169    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 170    the ending address (i.e., the starting address of the next
 171    character) of the multibyte form.  */
 172
 173 int
 174 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 175 {
 176   int c;
 177   const unsigned char *saved_p = p;
 178
 179   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 180     {
 181       c = STRING_CHAR_ADVANCE (p);
 182     }
 183   else if (! (*p & 0x08))
 184     {
 185       c = ((((p)[0] & 0xF) << 18)
 186            | (((p)[1] & 0x3F) << 12)
 187            | (((p)[2] & 0x3F) << 6)
 188            | ((p)[3] & 0x3F));
 189       p += 4;
 190     }
 191   else
 192     {
 193       c = ((((p)[1] & 0x3F) << 18)
 194            | (((p)[2] & 0x3F) << 12)
 195            | (((p)[3] & 0x3F) << 6)
 196            | ((p)[4] & 0x3F));
 197       p += 5;
 198     }
 199
 200   MAYBE_UNIFY_CHAR (c);
 201
 202   if (len)
 203     *len = p - saved_p;
 204   if (advanced)
 205     *advanced = p;
 206   return c;
 207 }
 208
 209
 210 /* Translate character C by translation table TABLE.  If no translation is
 211    found in TABLE, return the untranslated character.  If TABLE is a list,
 212    elements are char tables.  In that case, recursively translate C by all the
 213    tables in the list.  */
 214
 215 int
 216 translate_char (Lisp_Object table, int c)
 217 {
 218   if (CHAR_TABLE_P (table))
 219     {
 220       Lisp_Object ch;
 221
 222       ch = CHAR_TABLE_REF (table, c);
 223       if (CHARACTERP (ch))
 224         c = XINT (ch);
 225     }
 226   else
 227     {
 228       for (; CONSP (table); table = XCDR (table))
 229         c = translate_char (XCAR (table), c);
 230     }
 231   return c;
 232 }
 233
 234 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 235    them, return (C & 0xFF).  */
 236
 237 int
 238 multibyte_char_to_unibyte (int c)
 239 {
 240   if (c < 0x80)
 241     return c;
 242   if (CHAR_BYTE8_P (c))
 243     return CHAR_TO_BYTE8 (c);
 244   return (c & 0xFF);
 245 }
 246
 247 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 248    by charset_unibyte.  */
 249
 250 int
 251 multibyte_char_to_unibyte_safe (int c)
 252 {
 253   if (c < 0x80)
 254     return c;
 255   if (CHAR_BYTE8_P (c))
 256     return CHAR_TO_BYTE8 (c);
 257   return -1;
 258 }
 259
 260 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 261        doc: /* Return non-nil if OBJECT is a character.
 262 usage: (characterp OBJECT)  */)
 263   (Lisp_Object object, Lisp_Object ignore)
 264 {
 265   return (CHARACTERP (object) ? Qt : Qnil);
 266 }
 267
 268 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 269        doc: /* Return the character of the maximum code.  */)
 270   (void)
 271 {
 272   return make_number (MAX_CHAR);
 273 }
 274
 275 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 276        Sunibyte_char_to_multibyte, 1, 1, 0,
 277        doc: /* Convert the byte CH to multibyte character.  */)
 278   (Lisp_Object ch)
 279 {
 280   int c;
 281
 282   CHECK_CHARACTER (ch);
 283   c = XFASTINT (ch);
 284   if (c >= 0x100)
 285     error ("Not a unibyte character: %d", c);
 286   MAKE_CHAR_MULTIBYTE (c);
 287   return make_number (c);
 288 }
 289
 290 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 291        Smultibyte_char_to_unibyte, 1, 1, 0,
 292        doc: /* Convert the multibyte character CH to a byte.
 293 If the multibyte character does not represent a byte, return -1.  */)
 294   (Lisp_Object ch)
 295 {
 296   int cm;
 297
 298   CHECK_CHARACTER (ch);
 299   cm = XFASTINT (ch);
 300   if (cm < 256)
 301     /* Can't distinguish a byte read from a unibyte buffer from
 302        a latin1 char, so let's let it slide.  */
 303     return ch;
 304   else
 305     {
 306       int cu = CHAR_TO_BYTE_SAFE (cm);
 307       return make_number (cu);
 308     }
 309 }
 310
 311
 312 /* Return width (columns) of C considering the buffer display table DP. */
 313
 314 static EMACS_INT
 315 char_width (int c, struct Lisp_Char_Table *dp)
 316 {
 317   EMACS_INT width = CHAR_WIDTH (c);
 318
 319   if (dp)
 320     {
 321       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 322       int i;
 323
 324       if (VECTORP (disp))
 325         for (i = 0, width = 0; i < ASIZE (disp); i++)
 326           {
 327             ch = AREF (disp, i);
 328             if (CHARACTERP (ch))
 329               {
 330                 int w = CHAR_WIDTH (XFASTINT (ch));
 331                 if (INT_ADD_OVERFLOW (width, w))
 332                   string_overflow ();
 333                 width += w;
 334               }
 335           }
 336     }
 337   return width;
 338 }
 339
 340
 341 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 342        doc: /* Return width of CHAR when displayed in the current buffer.
 343 The width is measured by how many columns it occupies on the screen.
 344 Tab is taken to occupy `tab-width' columns.
 345 usage: (char-width CHAR)  */)
 346   (Lisp_Object ch)
 347 {
 348   int c;
 349   EMACS_INT width;
 350
 351   CHECK_CHARACTER (ch);
 352   c = XINT (ch);
 353   width = char_width (c, buffer_display_table ());
 354   return make_number (width);
 355 }
 356
 357 /* Return width of string STR of length LEN when displayed in the
 358    current buffer.  The width is measured by how many columns it
 359    occupies on the screen.  If PRECISION > 0, return the width of
 360    longest substring that doesn't exceed PRECISION, and set number of
 361    characters and bytes of the substring in *NCHARS and *NBYTES
 362    respectively.  */
 363
 364 EMACS_INT
 365 c_string_width (const unsigned char *str, EMACS_INT len, int precision,
 366                 EMACS_INT *nchars, EMACS_INT *nbytes)
 367 {
 368   EMACS_INT i = 0, i_byte = 0;
 369   EMACS_INT width = 0;
 370   struct Lisp_Char_Table *dp = buffer_display_table ();
 371
 372   while (i_byte < len)
 373     {
 374       int bytes;
 375       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 376       EMACS_INT thiswidth = char_width (c, dp);
 377
 378       if (precision <= 0)
 379         {
 380           if (INT_ADD_OVERFLOW (width, thiswidth))
 381             string_overflow ();
 382         }
 383       else if (precision - width < thiswidth)
 384         {
 385           *nchars = i;
 386           *nbytes = i_byte;
 387           return width;
 388         }
 389       i++;
 390       i_byte += bytes;
 391       width += thiswidth;
 392   }
 393
 394   if (precision > 0)
 395     {
 396       *nchars = i;
 397       *nbytes = i_byte;
 398     }
 399
 400   return width;
 401 }
 402
 403 /* Return width of string STR of length LEN when displayed in the
 404    current buffer.  The width is measured by how many columns it
 405    occupies on the screen.  */
 406
 407 EMACS_INT
 408 strwidth (const char *str, EMACS_INT len)
 409 {
 410   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 411 }
 412
 413 /* Return width of Lisp string STRING when displayed in the current
 414    buffer.  The width is measured by how many columns it occupies on
 415    the screen while paying attention to compositions.  If PRECISION >
 416    0, return the width of longest substring that doesn't exceed
 417    PRECISION, and set number of characters and bytes of the substring
 418    in *NCHARS and *NBYTES respectively.  */
 419
 420 EMACS_INT
 421 lisp_string_width (Lisp_Object string, EMACS_INT precision,
 422                    EMACS_INT *nchars, EMACS_INT *nbytes)
 423 {
 424   EMACS_INT len = SCHARS (string);
 425   /* This set multibyte to 0 even if STRING is multibyte when it
 426      contains only ascii and eight-bit-graphic, but that's
 427      intentional.  */
 428   int multibyte = len < SBYTES (string);
 429   unsigned char *str = SDATA (string);
 430   EMACS_INT i = 0, i_byte = 0;
 431   EMACS_INT width = 0;
 432   struct Lisp_Char_Table *dp = buffer_display_table ();
 433
 434   while (i < len)
 435     {
 436       EMACS_INT chars, bytes, thiswidth;
 437       Lisp_Object val;
 438       ptrdiff_t cmp_id;
 439       EMACS_INT ignore, end;
 440
 441       if (find_composition (i, -1, &ignore, &end, &val, string)
 442           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 443               >= 0))
 444         {
 445           thiswidth = composition_table[cmp_id]->width;
 446           chars = end - i;
 447           bytes = string_char_to_byte (string, end) - i_byte;
 448         }
 449       else
 450         {
 451           int c;
 452
 453           if (multibyte)
 454             {
 455               int cbytes;
 456               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 457               bytes = cbytes;
 458             }
 459           else
 460             c = str[i_byte], bytes = 1;
 461           chars = 1;
 462           thiswidth = char_width (c, dp);
 463         }
 464
 465       if (precision <= 0)
 466         {
 467 #ifdef emacs
 468           if (INT_ADD_OVERFLOW (width, thiswidth))
 469             string_overflow ();
 470 #endif
 471         }
 472       else if (precision - width < thiswidth)
 473         {
 474           *nchars = i;
 475           *nbytes = i_byte;
 476           return width;
 477         }
 478       i += chars;
 479       i_byte += bytes;
 480       width += thiswidth;
 481     }
 482
 483   if (precision > 0)
 484     {
 485       *nchars = i;
 486       *nbytes = i_byte;
 487     }
 488
 489   return width;
 490 }
 491
 492 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 493        doc: /* Return width of STRING when displayed in the current buffer.
 494 Width is measured by how many columns it occupies on the screen.
 495 When calculating width of a multibyte character in STRING,
 496 only the base leading-code is considered; the validity of
 497 the following bytes is not checked.  Tabs in STRING are always
 498 taken to occupy `tab-width' columns.
 499 usage: (string-width STRING)  */)
 500   (Lisp_Object str)
 501 {
 502   Lisp_Object val;
 503
 504   CHECK_STRING (str);
 505   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 506   return val;
 507 }
 508
 509 /* Return the number of characters in the NBYTES bytes at PTR.
 510    This works by looking at the contents and checking for multibyte
 511    sequences while assuming that there's no invalid sequence.
 512    However, if the current buffer has enable-multibyte-characters =
 513    nil, we treat each byte as a character.  */
 514
 515 EMACS_INT
 516 chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
 517 {
 518   /* current_buffer is null at early stages of Emacs initialization.  */
 519   if (current_buffer == 0
 520       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 521     return nbytes;
 522
 523   return multibyte_chars_in_text (ptr, nbytes);
 524 }
 525
 526 /* Return the number of characters in the NBYTES bytes at PTR.
 527    This works by looking at the contents and checking for multibyte
 528    sequences while assuming that there's no invalid sequence.  It
 529    ignores enable-multibyte-characters.  */
 530
 531 EMACS_INT
 532 multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
 533 {
 534   const unsigned char *endp = ptr + nbytes;
 535   EMACS_INT chars = 0;
 536
 537   while (ptr < endp)
 538     {
 539       EMACS_INT len = MULTIBYTE_LENGTH (ptr, endp);
 540
 541       if (len == 0)
 542         abort ();
 543       ptr += len;
 544       chars++;
 545     }
 546
 547   return chars;
 548 }
 549
 550 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 551    characters and bytes in it, and store them in *NCHARS and *NBYTES
 552    respectively.  On counting bytes, pay attention to that 8-bit
 553    characters not constructing a valid multibyte sequence are
 554    represented by 2-byte in a multibyte text.  */
 555
 556 void
 557 parse_str_as_multibyte (const unsigned char *str, EMACS_INT len,
 558                         EMACS_INT *nchars, EMACS_INT *nbytes)
 559 {
 560   const unsigned char *endp = str + len;
 561   EMACS_INT n, chars = 0, bytes = 0;
 562
 563   if (len >= MAX_MULTIBYTE_LENGTH)
 564     {
 565       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 566       while (str < adjusted_endp)
 567         {
 568           if (! CHAR_BYTE8_HEAD_P (*str)
 569               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 570             str += n, bytes += n;
 571           else
 572             str++, bytes += 2;
 573           chars++;
 574         }
 575     }
 576   while (str < endp)
 577     {
 578       if (! CHAR_BYTE8_HEAD_P (*str)
 579           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 580         str += n, bytes += n;
 581       else
 582         str++, bytes += 2;
 583       chars++;
 584     }
 585
 586   *nchars = chars;
 587   *nbytes = bytes;
 588   return;
 589 }
 590
 591 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 592    It actually converts only such 8-bit characters that don't construct
 593    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 594    NCHARS is nonzero, set *NCHARS to the number of characters in the
 595    text.  It is assured that we can use LEN bytes at STR as a work
 596    area and that is enough.  Return the number of bytes of the
 597    resulting text.  */
 598
 599 EMACS_INT
 600 str_as_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT nbytes,
 601                   EMACS_INT *nchars)
 602 {
 603   unsigned char *p = str, *endp = str + nbytes;
 604   unsigned char *to;
 605   EMACS_INT chars = 0;
 606   int n;
 607
 608   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 609     {
 610       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 611       while (p < adjusted_endp
 612              && ! CHAR_BYTE8_HEAD_P (*p)
 613              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 614         p += n, chars++;
 615     }
 616   while (p < endp
 617          && ! CHAR_BYTE8_HEAD_P (*p)
 618          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 619     p += n, chars++;
 620   if (nchars)
 621     *nchars = chars;
 622   if (p == endp)
 623     return nbytes;
 624
 625   to = p;
 626   nbytes = endp - p;
 627   endp = str + len;
 628   memmove (endp - nbytes, p, nbytes);
 629   p = endp - nbytes;
 630
 631   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 632     {
 633       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 634       while (p < adjusted_endp)
 635         {
 636           if (! CHAR_BYTE8_HEAD_P (*p)
 637               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 638             {
 639               while (n--)
 640                 *to++ = *p++;
 641             }
 642           else
 643             {
 644               int c = *p++;
 645               c = BYTE8_TO_CHAR (c);
 646               to += CHAR_STRING (c, to);
 647             }
 648         }
 649       chars++;
 650     }
 651   while (p < endp)
 652     {
 653       if (! CHAR_BYTE8_HEAD_P (*p)
 654           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 655         {
 656           while (n--)
 657             *to++ = *p++;
 658         }
 659       else
 660         {
 661           int c = *p++;
 662           c = BYTE8_TO_CHAR (c);
 663           to += CHAR_STRING (c, to);
 664         }
 665       chars++;
 666     }
 667   if (nchars)
 668     *nchars = chars;
 669   return (to - str);
 670 }
 671
 672 /* Parse unibyte string at STR of LEN bytes, and return the number of
 673    bytes it may occupy when converted to multibyte string by
 674    `str_to_multibyte'.  */
 675
 676 EMACS_INT
 677 count_size_as_multibyte (const unsigned char *str, EMACS_INT len)
 678 {
 679   const unsigned char *endp = str + len;
 680   EMACS_INT bytes;
 681
 682   for (bytes = 0; str < endp; str++)
 683     {
 684       int n = *str < 0x80 ? 1 : 2;
 685       if (INT_ADD_OVERFLOW (bytes, n))
 686         string_overflow ();
 687       bytes += n;
 688     }
 689   return bytes;
 690 }
 691
 692
 693 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 694    that contains the same single-byte characters.  It actually
 695    converts all 8-bit characters to multibyte forms.  It is assured
 696    that we can use LEN bytes at STR as a work area and that is
 697    enough.  */
 698
 699 EMACS_INT
 700 str_to_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT bytes)
 701 {
 702   unsigned char *p = str, *endp = str + bytes;
 703   unsigned char *to;
 704
 705   while (p < endp && *p < 0x80) p++;
 706   if (p == endp)
 707     return bytes;
 708   to = p;
 709   bytes = endp - p;
 710   endp = str + len;
 711   memmove (endp - bytes, p, bytes);
 712   p = endp - bytes;
 713   while (p < endp)
 714     {
 715       int c = *p++;
 716
 717       if (c >= 0x80)
 718         c = BYTE8_TO_CHAR (c);
 719       to += CHAR_STRING (c, to);
 720     }
 721   return (to - str);
 722 }
 723
 724 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 725    actually converts characters in the range 0x80..0xFF to
 726    unibyte.  */
 727
 728 EMACS_INT
 729 str_as_unibyte (unsigned char *str, EMACS_INT bytes)
 730 {
 731   const unsigned char *p = str, *endp = str + bytes;
 732   unsigned char *to;
 733   int c, len;
 734
 735   while (p < endp)
 736     {
 737       c = *p;
 738       len = BYTES_BY_CHAR_HEAD (c);
 739       if (CHAR_BYTE8_HEAD_P (c))
 740         break;
 741       p += len;
 742     }
 743   to = str + (p - str);
 744   while (p < endp)
 745     {
 746       c = *p;
 747       len = BYTES_BY_CHAR_HEAD (c);
 748       if (CHAR_BYTE8_HEAD_P (c))
 749         {
 750           c = STRING_CHAR_ADVANCE (p);
 751           *to++ = CHAR_TO_BYTE8 (c);
 752         }
 753       else
 754         {
 755           while (len--) *to++ = *p++;
 756         }
 757     }
 758   return (to - str);
 759 }
 760
 761 /* Convert eight-bit chars in SRC (in multibyte form) to the
 762    corresponding byte and store in DST.  CHARS is the number of
 763    characters in SRC.  The value is the number of bytes stored in DST.
 764    Usually, the value is the same as CHARS, but is less than it if SRC
 765    contains a non-ASCII, non-eight-bit character.  If ACCEPT_LATIN_1
 766    is nonzero, a Latin-1 character is accepted and converted to a byte
 767    of that character code.
 768    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 769
 770 EMACS_INT
 771 str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
 772 {
 773   EMACS_INT i;
 774
 775   for (i = 0; i < chars; i++)
 776     {
 777       int c = STRING_CHAR_ADVANCE (src);
 778
 779       if (CHAR_BYTE8_P (c))
 780         c = CHAR_TO_BYTE8 (c);
 781       else if (! ASCII_CHAR_P (c)
 782                && (! accept_latin_1 || c >= 0x100))
 783         return i;
 784       *dst++ = c;
 785     }
 786   return i;
 787 }
 788
 789
 790 static EMACS_INT
 791 string_count_byte8 (Lisp_Object string)
 792 {
 793   int multibyte = STRING_MULTIBYTE (string);
 794   EMACS_INT nbytes = SBYTES (string);
 795   unsigned char *p = SDATA (string);
 796   unsigned char *pend = p + nbytes;
 797   EMACS_INT count = 0;
 798   int c, len;
 799
 800   if (multibyte)
 801     while (p < pend)
 802       {
 803         c = *p;
 804         len = BYTES_BY_CHAR_HEAD (c);
 805
 806         if (CHAR_BYTE8_HEAD_P (c))
 807           count++;
 808         p += len;
 809       }
 810   else
 811     while (p < pend)
 812       {
 813         if (*p++ >= 0x80)
 814           count++;
 815       }
 816   return count;
 817 }
 818
 819
 820 Lisp_Object
 821 string_escape_byte8 (Lisp_Object string)
 822 {
 823   EMACS_INT nchars = SCHARS (string);
 824   EMACS_INT nbytes = SBYTES (string);
 825   int multibyte = STRING_MULTIBYTE (string);
 826   EMACS_INT byte8_count;
 827   const unsigned char *src, *src_end;
 828   unsigned char *dst;
 829   Lisp_Object val;
 830   int c, len;
 831
 832   if (multibyte && nchars == nbytes)
 833     return string;
 834
 835   byte8_count = string_count_byte8 (string);
 836
 837   if (byte8_count == 0)
 838     return string;
 839
 840   if (multibyte)
 841     {
 842       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 843           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 844         string_overflow ();
 845
 846       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 847       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 848                                           nbytes + byte8_count * 2);
 849     }
 850   else
 851     {
 852       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 853         string_overflow ();
 854
 855       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 856       val = make_uninit_string (nbytes + byte8_count * 3);
 857     }
 858
 859   src = SDATA (string);
 860   src_end = src + nbytes;
 861   dst = SDATA (val);
 862   if (multibyte)
 863     while (src < src_end)
 864       {
 865         c = *src;
 866         len = BYTES_BY_CHAR_HEAD (c);
 867
 868         if (CHAR_BYTE8_HEAD_P (c))
 869           {
 870             c = STRING_CHAR_ADVANCE (src);
 871             c = CHAR_TO_BYTE8 (c);
 872             sprintf ((char *) dst, "\\%03o", c);
 873             dst += 4;
 874           }
 875         else
 876           while (len--) *dst++ = *src++;
 877       }
 878   else
 879     while (src < src_end)
 880       {
 881         c = *src++;
 882         if (c >= 0x80)
 883           {
 884             sprintf ((char *) dst, "\\%03o", c);
 885             dst += 4;
 886           }
 887         else
 888           *dst++ = c;
 889       }
 890   return val;
 891 }
 892
 893 \f
 894 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 895        doc: /*
 896 Concatenate all the argument characters and make the result a string.
 897 usage: (string &rest CHARACTERS)  */)
 898   (ptrdiff_t n, Lisp_Object *args)
 899 {
 900   ptrdiff_t i;
 901   int c;
 902   unsigned char *buf, *p;
 903   Lisp_Object str;
 904   USE_SAFE_ALLOCA;
 905
 906   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 907   p = buf;
 908
 909   for (i = 0; i < n; i++)
 910     {
 911       CHECK_CHARACTER (args[i]);
 912       c = XINT (args[i]);
 913       p += CHAR_STRING (c, p);
 914     }
 915
 916   str = make_string_from_bytes ((char *) buf, n, p - buf);
 917   SAFE_FREE ();
 918   return str;
 919 }
 920
 921 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 922        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 923 usage: (unibyte-string &rest BYTES)  */)
 924   (ptrdiff_t n, Lisp_Object *args)
 925 {
 926   ptrdiff_t i;
 927   int c;
 928   unsigned char *buf, *p;
 929   Lisp_Object str;
 930   USE_SAFE_ALLOCA;
 931
 932   SAFE_ALLOCA (buf, unsigned char *, n);
 933   p = buf;
 934
 935   for (i = 0; i < n; i++)
 936     {
 937       CHECK_NATNUM (args[i]);
 938       c = XFASTINT (args[i]);
 939       if (c >= 256)
 940         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 941       *p++ = c;
 942     }
 943
 944   str = make_string_from_bytes ((char *) buf, n, p - buf);
 945   SAFE_FREE ();
 946   return str;
 947 }
 948
 949 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 950        Schar_resolve_modifiers, 1, 1, 0,
 951        doc: /* Resolve modifiers in the character CHAR.
 952 The value is a character with modifiers resolved into the character
 953 code.  Unresolved modifiers are kept in the value.
 954 usage: (char-resolve-modifiers CHAR)  */)
 955   (Lisp_Object character)
 956 {
 957   int c;
 958
 959   CHECK_NUMBER (character);
 960   c = XINT (character);
 961   return make_number (char_resolve_modifier_mask (c));
 962 }
 963
 964 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 965        doc: /* Return a byte value of a character at point.
 966 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 967 a byte value.
 968 Optional 2nd arg STRING, if non-nil, is a string of which first
 969 character is a target to get a byte value.  In this case, POSITION, if
 970 non-nil, is an index of a target character in the string.
 971
 972 If the current buffer (or STRING) is multibyte, and the target
 973 character is not ASCII nor 8-bit character, an error is signaled.  */)
 974   (Lisp_Object position, Lisp_Object string)
 975 {
 976   int c;
 977   EMACS_INT pos;
 978   unsigned char *p;
 979
 980   if (NILP (string))
 981     {
 982       if (NILP (position))
 983         {
 984           p = PT_ADDR;
 985         }
 986       else
 987         {
 988           CHECK_NUMBER_COERCE_MARKER (position);
 989           if (XINT (position) < BEGV || XINT (position) >= ZV)
 990             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 991           pos = XFASTINT (position);
 992           p = CHAR_POS_ADDR (pos);
 993         }
 994       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 995         return make_number (*p);
 996     }
 997   else
 998     {
 999       CHECK_STRING (string);
1000       if (NILP (position))
1001         {
1002           p = SDATA (string);
1003         }
1004       else
1005         {
1006           CHECK_NATNUM (position);
1007           if (XINT (position) >= SCHARS (string))
1008             args_out_of_range (string, position);
1009           pos = XFASTINT (position);
1010           p = SDATA (string) + string_char_to_byte (string, pos);
1011         }
1012       if (! STRING_MULTIBYTE (string))
1013         return make_number (*p);
1014     }
1015   c = STRING_CHAR (p);
1016   if (CHAR_BYTE8_P (c))
1017     c = CHAR_TO_BYTE8 (c);
1018   else if (! ASCII_CHAR_P (c))
1019     error ("Not an ASCII nor an 8-bit character: %d", c);
1020   return make_number (c);
1021 }
1022
1023
/* One-time initialization hook for this file.  */

void
init_character_once (void)
{
  /* Nothing to initialize.  */
}
1028
1029 #ifdef emacs
1030
1031 void
1032 syms_of_character (void)
1033 {
1034   DEFSYM (Qcharacterp, "characterp");
1035   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1036
1037   staticpro (&Vchar_unify_table);
1038   Vchar_unify_table = Qnil;
1039
1040   defsubr (&Smax_char);
1041   defsubr (&Scharacterp);
1042   defsubr (&Sunibyte_char_to_multibyte);
1043   defsubr (&Smultibyte_char_to_unibyte);
1044   defsubr (&Schar_width);
1045   defsubr (&Sstring_width);
1046   defsubr (&Sstring);
1047   defsubr (&Sunibyte_string);
1048   defsubr (&Schar_resolve_modifiers);
1049   defsubr (&Sget_byte);
1050
1051   DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
1052                doc: /*
1053 Vector recording all translation tables ever defined.
1054 Each element is a pair (SYMBOL . TABLE) relating the table to the
1055 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1056   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1057
1058   DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
1059                doc: /*
1060 A char-table for characters which invoke auto-filling.
1061 Such characters have value t in this table.  */);
1062   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1063   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1064   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1065
1066   DEFVAR_LISP ("char-width-table", Vchar_width_table,
1067                doc: /*
1068 A char-table for width (columns) of each character.  */);
1069   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1070   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1071   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1072                         make_number (4));
1073
1074   DEFVAR_LISP ("printable-chars", Vprintable_chars,
1075                doc: /* A char-table for each printable character.  */);
1076   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1077   Fset_char_table_range (Vprintable_chars,
1078                          Fcons (make_number (32), make_number (126)), Qt);
1079   Fset_char_table_range (Vprintable_chars,
1080                          Fcons (make_number (160),
1081                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1082
1083   DEFVAR_LISP ("char-script-table", Vchar_script_table,
1084                doc: /* Char table of script symbols.
1085 It has one extra slot whose value is a list of script symbols.  */);
1086
1087   /* Intern this now in case it isn't already done.
1088      Setting this variable twice is harmless.
1089      But don't staticpro it here--that is done in alloc.c.  */
1090   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
1091   DEFSYM (Qchar_script_table, "char-script-table");
1092   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1093   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1094
1095   DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
1096                doc: /* Alist of scripts vs the representative characters.
1097 Each element is a cons (SCRIPT . CHARS).
1098 SCRIPT is a symbol representing a script or a subgroup of a script.
1099 CHARS is a list or a vector of characters.
1100 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1101 If it is a vector, one of the characters in the vector is necessary.
1102 This variable is used to find a font for a specific script.  */);
1103   Vscript_representative_chars = Qnil;
1104
1105   DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
1106                doc: /* Char table of Unicode's "General Category".
1107 All Unicode characters have one of the following values (symbol):
1108   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1109   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1110 See The Unicode Standard for the meaning of those values.  */);
1111   /* The correct char-table is setup in characters.el.  */
1112   Vunicode_category_table = Qnil;
1113 }
1114
1115 #endif /* emacs */