src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2011  Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <setjmp.h>
  38 #include <intprops.h>
  39 #include "lisp.h"
  40 #include "character.h"
  41 #include "buffer.h"
  42 #include "charset.h"
  43 #include "composite.h"
  44 #include "disptab.h"
  45
  46 #else  /* not emacs */
  47
  48 #include "mulelib.h"
  49
  50 #endif /* emacs */
  51
  52 Lisp_Object Qcharacterp;
  53
  54 static Lisp_Object Qauto_fill_chars;
  55
  56 /* Char-table of information about which character to unify to which
  57    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  58 Lisp_Object Vchar_unify_table;
  59
  60 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  61 unsigned char *_fetch_multibyte_char_p;
  62
  63 static Lisp_Object Qchar_script_table;
  64
  65 \f
  66
  67 /* If character code C has modifier masks, reflect them to the
  68    character code if possible.  Return the resulting code.  */
  69
  70 int
  71 char_resolve_modifier_mask (int c)
  72 {
  73   /* A non-ASCII character can't reflect modifier bits to the code.  */
  74   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  75     return c;
  76
  77   /* For Meta, Shift, and Control modifiers, we need special care.  */
  78   if (c & CHAR_SHIFT)
  79     {
  80       /* Shift modifier is valid only with [A-Za-z].  */
  81       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  82         c &= ~CHAR_SHIFT;
  83       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  84         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  85       /* Shift modifier for control characters and SPC is ignored.  */
  86       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  87         c &= ~CHAR_SHIFT;
  88     }
  89   if (c & CHAR_CTL)
  90     {
  91       /* Simulate the code in lread.c.  */
  92       /* Allow `\C- ' and `\C-?'.  */
  93       if ((c & 0377) == ' ')
  94         c &= ~0177 & ~ CHAR_CTL;
  95       else if ((c & 0377) == '?')
  96         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  97       /* ASCII control chars are made from letters (both cases),
  98          as well as the non-letters within 0100...0137.  */
  99       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 100         c &= (037 | (~0177 & ~CHAR_CTL));
 101       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 102         c &= (037 | (~0177 & ~CHAR_CTL));
 103     }
 104 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 105   if (c & CHAR_META)
 106     {
 107       /* Move the meta bit to the right place for a string.  */
 108       c = (c & ~CHAR_META) | 0x80;
 109     }
 110 #endif
 111
 112   return c;
 113 }
 114
 115
 116 /* Store multibyte form of character C at P.  If C has modifier bits,
 117    handle them appropriately.  */
 118
 119 int
 120 char_string (unsigned int c, unsigned char *p)
 121 {
 122   int bytes;
 123
 124   if (c & CHAR_MODIFIER_MASK)
 125     {
 126       c = char_resolve_modifier_mask (c);
 127       /* If C still has any modifier bits, just ignore it.  */
 128       c &= ~CHAR_MODIFIER_MASK;
 129     }
 130
 131   MAYBE_UNIFY_CHAR (c);
 132
 133   if (c <= MAX_3_BYTE_CHAR)
 134     {
 135       bytes = CHAR_STRING (c, p);
 136     }
 137   else if (c <= MAX_4_BYTE_CHAR)
 138     {
 139       p[0] = (0xF0 | (c >> 18));
 140       p[1] = (0x80 | ((c >> 12) & 0x3F));
 141       p[2] = (0x80 | ((c >> 6) & 0x3F));
 142       p[3] = (0x80 | (c & 0x3F));
 143       bytes = 4;
 144     }
 145   else if (c <= MAX_5_BYTE_CHAR)
 146     {
 147       p[0] = 0xF8;
 148       p[1] = (0x80 | ((c >> 18) & 0x0F));
 149       p[2] = (0x80 | ((c >> 12) & 0x3F));
 150       p[3] = (0x80 | ((c >> 6) & 0x3F));
 151       p[4] = (0x80 | (c & 0x3F));
 152       bytes = 5;
 153     }
 154   else if (c <= MAX_CHAR)
 155     {
 156       c = CHAR_TO_BYTE8 (c);
 157       bytes = BYTE8_STRING (c, p);
 158     }
 159   else
 160     error ("Invalid character: %x", c);
 161
 162   return bytes;
 163 }
 164
 165
 166 /* Return a character whose multibyte form is at P.  If LEN is not
 167    NULL, it must be a pointer to integer.  In that case, set *LEN to
 168    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 169    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 170    the ending address (i.e., the starting address of the next
 171    character) of the multibyte form.  */
 172
 173 int
 174 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 175 {
 176   int c;
 177   const unsigned char *saved_p = p;
 178
 179   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 180     {
 181       c = STRING_CHAR_ADVANCE (p);
 182     }
 183   else if (! (*p & 0x08))
 184     {
 185       c = ((((p)[0] & 0xF) << 18)
 186            | (((p)[1] & 0x3F) << 12)
 187            | (((p)[2] & 0x3F) << 6)
 188            | ((p)[3] & 0x3F));
 189       p += 4;
 190     }
 191   else
 192     {
 193       c = ((((p)[1] & 0x3F) << 18)
 194            | (((p)[2] & 0x3F) << 12)
 195            | (((p)[3] & 0x3F) << 6)
 196            | ((p)[4] & 0x3F));
 197       p += 5;
 198     }
 199
 200   MAYBE_UNIFY_CHAR (c);
 201
 202   if (len)
 203     *len = p - saved_p;
 204   if (advanced)
 205     *advanced = p;
 206   return c;
 207 }
 208
 209
 210 /* Translate character C by translation table TABLE.  If no translation is
 211    found in TABLE, return the untranslated character.  If TABLE is a list,
 212    elements are char tables.  In that case, recursively translate C by all the
 213    tables in the list.  */
 214
 215 int
 216 translate_char (Lisp_Object table, int c)
 217 {
 218   if (CHAR_TABLE_P (table))
 219     {
 220       Lisp_Object ch;
 221
 222       ch = CHAR_TABLE_REF (table, c);
 223       if (CHARACTERP (ch))
 224         c = XINT (ch);
 225     }
 226   else
 227     {
 228       for (; CONSP (table); table = XCDR (table))
 229         c = translate_char (XCAR (table), c);
 230     }
 231   return c;
 232 }
 233
 234 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 235    them, return (C & 0xFF).  */
 236
 237 int
 238 multibyte_char_to_unibyte (int c)
 239 {
 240   if (c < 0x80)
 241     return c;
 242   if (CHAR_BYTE8_P (c))
 243     return CHAR_TO_BYTE8 (c);
 244   return (c & 0xFF);
 245 }
 246
 247 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 248    by charset_unibyte.  */
 249
 250 int
 251 multibyte_char_to_unibyte_safe (int c)
 252 {
 253   if (c < 0x80)
 254     return c;
 255   if (CHAR_BYTE8_P (c))
 256     return CHAR_TO_BYTE8 (c);
 257   return -1;
 258 }
 259
 260 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 261        doc: /* Return non-nil if OBJECT is a character.
 262 usage: (characterp OBJECT)  */)
 263   (Lisp_Object object, Lisp_Object ignore)
 264 {
 265   return (CHARACTERP (object) ? Qt : Qnil);
 266 }
 267
 268 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 269        doc: /* Return the character of the maximum code.  */)
 270   (void)
 271 {
 272   return make_number (MAX_CHAR);
 273 }
 274
 275 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 276        Sunibyte_char_to_multibyte, 1, 1, 0,
 277        doc: /* Convert the byte CH to multibyte character.  */)
 278   (Lisp_Object ch)
 279 {
 280   int c;
 281
 282   CHECK_CHARACTER (ch);
 283   c = XFASTINT (ch);
 284   if (c >= 0x100)
 285     error ("Not a unibyte character: %d", c);
 286   MAKE_CHAR_MULTIBYTE (c);
 287   return make_number (c);
 288 }
 289
 290 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 291        Smultibyte_char_to_unibyte, 1, 1, 0,
 292        doc: /* Convert the multibyte character CH to a byte.
 293 If the multibyte character does not represent a byte, return -1.  */)
 294   (Lisp_Object ch)
 295 {
 296   int cm;
 297
 298   CHECK_CHARACTER (ch);
 299   cm = XFASTINT (ch);
 300   if (cm < 256)
 301     /* Can't distinguish a byte read from a unibyte buffer from
 302        a latin1 char, so let's let it slide.  */
 303     return ch;
 304   else
 305     {
 306       int cu = CHAR_TO_BYTE_SAFE (cm);
 307       return make_number (cu);
 308     }
 309 }
 310
 311 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 312        doc: /* Return width of CHAR when displayed in the current buffer.
 313 The width is measured by how many columns it occupies on the screen.
 314 Tab is taken to occupy `tab-width' columns.
 315 usage: (char-width CHAR)  */)
 316   (Lisp_Object ch)
 317 {
 318   Lisp_Object disp;
 319   int c, width;
 320   struct Lisp_Char_Table *dp = buffer_display_table ();
 321
 322   CHECK_CHARACTER (ch);
 323   c = XINT (ch);
 324
 325   /* Get the way the display table would display it.  */
 326   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 327
 328   if (VECTORP (disp))
 329     width = ASIZE (disp);
 330   else
 331     width = CHAR_WIDTH (c);
 332
 333   return make_number (width);
 334 }
 335
 336 /* Return width of string STR of length LEN when displayed in the
 337    current buffer.  The width is measured by how many columns it
 338    occupies on the screen.  If PRECISION > 0, return the width of
 339    longest substring that doesn't exceed PRECISION, and set number of
 340    characters and bytes of the substring in *NCHARS and *NBYTES
 341    respectively.  */
 342
 343 EMACS_INT
 344 c_string_width (const unsigned char *str, EMACS_INT len, int precision,
 345                 EMACS_INT *nchars, EMACS_INT *nbytes)
 346 {
 347   EMACS_INT i = 0, i_byte = 0;
 348   EMACS_INT width = 0;
 349   struct Lisp_Char_Table *dp = buffer_display_table ();
 350
 351   while (i_byte < len)
 352     {
 353       int bytes, thiswidth;
 354       Lisp_Object val;
 355       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 356
 357       if (dp)
 358         {
 359           val = DISP_CHAR_VECTOR (dp, c);
 360           if (VECTORP (val))
 361             thiswidth = ASIZE (val);
 362           else
 363             thiswidth = CHAR_WIDTH (c);
 364         }
 365       else
 366         {
 367           thiswidth = CHAR_WIDTH (c);
 368         }
 369
 370       if (precision > 0
 371           && (width + thiswidth > precision))
 372         {
 373           *nchars = i;
 374           *nbytes = i_byte;
 375           return width;
 376         }
 377       i++;
 378       i_byte += bytes;
 379       width += thiswidth;
 380   }
 381
 382   if (precision > 0)
 383     {
 384       *nchars = i;
 385       *nbytes = i_byte;
 386     }
 387
 388   return width;
 389 }
 390
 391 /* Return width of string STR of length LEN when displayed in the
 392    current buffer.  The width is measured by how many columns it
 393    occupies on the screen.  */
 394
 395 EMACS_INT
 396 strwidth (const char *str, EMACS_INT len)
 397 {
 398   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 399 }
 400
 401 /* Return width of Lisp string STRING when displayed in the current
 402    buffer.  The width is measured by how many columns it occupies on
 403    the screen while paying attention to compositions.  If PRECISION >
 404    0, return the width of longest substring that doesn't exceed
 405    PRECISION, and set number of characters and bytes of the substring
 406    in *NCHARS and *NBYTES respectively.  */
 407
 408 EMACS_INT
 409 lisp_string_width (Lisp_Object string, EMACS_INT precision,
 410                    EMACS_INT *nchars, EMACS_INT *nbytes)
 411 {
 412   EMACS_INT len = SCHARS (string);
 413   /* This set multibyte to 0 even if STRING is multibyte when it
 414      contains only ascii and eight-bit-graphic, but that's
 415      intentional.  */
 416   int multibyte = len < SBYTES (string);
 417   unsigned char *str = SDATA (string);
 418   EMACS_INT i = 0, i_byte = 0;
 419   EMACS_INT width = 0;
 420   struct Lisp_Char_Table *dp = buffer_display_table ();
 421
 422   while (i < len)
 423     {
 424       EMACS_INT chars, bytes, thiswidth;
 425       Lisp_Object val;
 426       int cmp_id;
 427       EMACS_INT ignore, end;
 428
 429       if (find_composition (i, -1, &ignore, &end, &val, string)
 430           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 431               >= 0))
 432         {
 433           thiswidth = composition_table[cmp_id]->width;
 434           chars = end - i;
 435           bytes = string_char_to_byte (string, end) - i_byte;
 436         }
 437       else
 438         {
 439           int c;
 440
 441           if (multibyte)
 442             {
 443               int cbytes;
 444               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 445               bytes = cbytes;
 446             }
 447           else
 448             c = str[i_byte], bytes = 1;
 449           chars = 1;
 450           if (dp)
 451             {
 452               val = DISP_CHAR_VECTOR (dp, c);
 453               if (VECTORP (val))
 454                 thiswidth = ASIZE (val);
 455               else
 456                 thiswidth = CHAR_WIDTH (c);
 457             }
 458           else
 459             {
 460               thiswidth = CHAR_WIDTH (c);
 461             }
 462         }
 463
 464       if (precision <= 0)
 465         {
 466 #ifdef emacs
 467           if (INT_ADD_OVERFLOW (width, thiswidth))
 468             string_overflow ();
 469 #endif
 470         }
 471       else if (precision - width < thiswidth)
 472         {
 473           *nchars = i;
 474           *nbytes = i_byte;
 475           return width;
 476         }
 477       i += chars;
 478       i_byte += bytes;
 479       width += thiswidth;
 480     }
 481
 482   if (precision > 0)
 483     {
 484       *nchars = i;
 485       *nbytes = i_byte;
 486     }
 487
 488   return width;
 489 }
 490
 491 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 492        doc: /* Return width of STRING when displayed in the current buffer.
 493 Width is measured by how many columns it occupies on the screen.
 494 When calculating width of a multibyte character in STRING,
 495 only the base leading-code is considered; the validity of
 496 the following bytes is not checked.  Tabs in STRING are always
 497 taken to occupy `tab-width' columns.
 498 usage: (string-width STRING)  */)
 499   (Lisp_Object str)
 500 {
 501   Lisp_Object val;
 502
 503   CHECK_STRING (str);
 504   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 505   return val;
 506 }
 507
 508 /* Return the number of characters in the NBYTES bytes at PTR.
 509    This works by looking at the contents and checking for multibyte
 510    sequences while assuming that there's no invalid sequence.
 511    However, if the current buffer has enable-multibyte-characters =
 512    nil, we treat each byte as a character.  */
 513
 514 EMACS_INT
 515 chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
 516 {
 517   /* current_buffer is null at early stages of Emacs initialization.  */
 518   if (current_buffer == 0
 519       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 520     return nbytes;
 521
 522   return multibyte_chars_in_text (ptr, nbytes);
 523 }
 524
 525 /* Return the number of characters in the NBYTES bytes at PTR.
 526    This works by looking at the contents and checking for multibyte
 527    sequences while assuming that there's no invalid sequence.  It
 528    ignores enable-multibyte-characters.  */
 529
 530 EMACS_INT
 531 multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
 532 {
 533   const unsigned char *endp = ptr + nbytes;
 534   EMACS_INT chars = 0;
 535
 536   while (ptr < endp)
 537     {
 538       EMACS_INT len = MULTIBYTE_LENGTH (ptr, endp);
 539
 540       if (len == 0)
 541         abort ();
 542       ptr += len;
 543       chars++;
 544     }
 545
 546   return chars;
 547 }
 548
 549 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 550    characters and bytes in it, and store them in *NCHARS and *NBYTES
 551    respectively.  On counting bytes, pay attention to that 8-bit
 552    characters not constructing a valid multibyte sequence are
 553    represented by 2-byte in a multibyte text.  */
 554
 555 void
 556 parse_str_as_multibyte (const unsigned char *str, EMACS_INT len,
 557                         EMACS_INT *nchars, EMACS_INT *nbytes)
 558 {
 559   const unsigned char *endp = str + len;
 560   EMACS_INT n, chars = 0, bytes = 0;
 561
 562   if (len >= MAX_MULTIBYTE_LENGTH)
 563     {
 564       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 565       while (str < adjusted_endp)
 566         {
 567           if (! CHAR_BYTE8_HEAD_P (*str)
 568               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 569             str += n, bytes += n;
 570           else
 571             str++, bytes += 2;
 572           chars++;
 573         }
 574     }
 575   while (str < endp)
 576     {
 577       if (! CHAR_BYTE8_HEAD_P (*str)
 578           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 579         str += n, bytes += n;
 580       else
 581         str++, bytes += 2;
 582       chars++;
 583     }
 584
 585   *nchars = chars;
 586   *nbytes = bytes;
 587   return;
 588 }
 589
 590 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 591    It actually converts only such 8-bit characters that don't contruct
 592    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 593    NCHARS is nonzero, set *NCHARS to the number of characters in the
 594    text.  It is assured that we can use LEN bytes at STR as a work
 595    area and that is enough.  Return the number of bytes of the
 596    resulting text.  */
 597
 598 EMACS_INT
 599 str_as_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT nbytes,
 600                   EMACS_INT *nchars)
 601 {
 602   unsigned char *p = str, *endp = str + nbytes;
 603   unsigned char *to;
 604   EMACS_INT chars = 0;
 605   int n;
 606
 607   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 608     {
 609       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 610       while (p < adjusted_endp
 611              && ! CHAR_BYTE8_HEAD_P (*p)
 612              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 613         p += n, chars++;
 614     }
 615   while (p < endp
 616          && ! CHAR_BYTE8_HEAD_P (*p)
 617          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 618     p += n, chars++;
 619   if (nchars)
 620     *nchars = chars;
 621   if (p == endp)
 622     return nbytes;
 623
 624   to = p;
 625   nbytes = endp - p;
 626   endp = str + len;
 627   memmove (endp - nbytes, p, nbytes);
 628   p = endp - nbytes;
 629
 630   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 631     {
 632       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 633       while (p < adjusted_endp)
 634         {
 635           if (! CHAR_BYTE8_HEAD_P (*p)
 636               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 637             {
 638               while (n--)
 639                 *to++ = *p++;
 640             }
 641           else
 642             {
 643               int c = *p++;
 644               c = BYTE8_TO_CHAR (c);
 645               to += CHAR_STRING (c, to);
 646             }
 647         }
 648       chars++;
 649     }
 650   while (p < endp)
 651     {
 652       if (! CHAR_BYTE8_HEAD_P (*p)
 653           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 654         {
 655           while (n--)
 656             *to++ = *p++;
 657         }
 658       else
 659         {
 660           int c = *p++;
 661           c = BYTE8_TO_CHAR (c);
 662           to += CHAR_STRING (c, to);
 663         }
 664       chars++;
 665     }
 666   if (nchars)
 667     *nchars = chars;
 668   return (to - str);
 669 }
 670
 671 /* Parse unibyte string at STR of LEN bytes, and return the number of
 672    bytes it may ocupy when converted to multibyte string by
 673    `str_to_multibyte'.  */
 674
 675 EMACS_INT
 676 count_size_as_multibyte (const unsigned char *str, EMACS_INT len)
 677 {
 678   const unsigned char *endp = str + len;
 679   EMACS_INT bytes;
 680
 681   for (bytes = 0; str < endp; str++)
 682     {
 683       int n = *str < 0x80 ? 1 : 2;
 684       if (INT_ADD_OVERFLOW (bytes, n))
 685         string_overflow ();
 686       bytes += n;
 687     }
 688   return bytes;
 689 }
 690
 691
 692 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 693    that contains the same single-byte characters.  It actually
 694    converts all 8-bit characters to multibyte forms.  It is assured
 695    that we can use LEN bytes at STR as a work area and that is
 696    enough.  */
 697
 698 EMACS_INT
 699 str_to_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT bytes)
 700 {
 701   unsigned char *p = str, *endp = str + bytes;
 702   unsigned char *to;
 703
 704   while (p < endp && *p < 0x80) p++;
 705   if (p == endp)
 706     return bytes;
 707   to = p;
 708   bytes = endp - p;
 709   endp = str + len;
 710   memmove (endp - bytes, p, bytes);
 711   p = endp - bytes;
 712   while (p < endp)
 713     {
 714       int c = *p++;
 715
 716       if (c >= 0x80)
 717         c = BYTE8_TO_CHAR (c);
 718       to += CHAR_STRING (c, to);
 719     }
 720   return (to - str);
 721 }
 722
 723 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 724    actually converts characters in the range 0x80..0xFF to
 725    unibyte.  */
 726
 727 EMACS_INT
 728 str_as_unibyte (unsigned char *str, EMACS_INT bytes)
 729 {
 730   const unsigned char *p = str, *endp = str + bytes;
 731   unsigned char *to;
 732   int c, len;
 733
 734   while (p < endp)
 735     {
 736       c = *p;
 737       len = BYTES_BY_CHAR_HEAD (c);
 738       if (CHAR_BYTE8_HEAD_P (c))
 739         break;
 740       p += len;
 741     }
 742   to = str + (p - str);
 743   while (p < endp)
 744     {
 745       c = *p;
 746       len = BYTES_BY_CHAR_HEAD (c);
 747       if (CHAR_BYTE8_HEAD_P (c))
 748         {
 749           c = STRING_CHAR_ADVANCE (p);
 750           *to++ = CHAR_TO_BYTE8 (c);
 751         }
 752       else
 753         {
 754           while (len--) *to++ = *p++;
 755         }
 756     }
 757   return (to - str);
 758 }
 759
 760 /* Convert eight-bit chars in SRC (in multibyte form) to the
 761    corresponding byte and store in DST.  CHARS is the number of
 762    characters in SRC.  The value is the number of bytes stored in DST.
 763    Usually, the value is the same as CHARS, but is less than it if SRC
 764    contains a non-ASCII, non-eight-bit character.  If ACCEPT_LATIN_1
 765    is nonzero, a Latin-1 character is accepted and converted to a byte
 766    of that character code.
 767    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 768
 769 EMACS_INT
 770 str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
 771 {
 772   EMACS_INT i;
 773
 774   for (i = 0; i < chars; i++)
 775     {
 776       int c = STRING_CHAR_ADVANCE (src);
 777
 778       if (CHAR_BYTE8_P (c))
 779         c = CHAR_TO_BYTE8 (c);
 780       else if (! ASCII_CHAR_P (c)
 781                && (! accept_latin_1 || c >= 0x100))
 782         return i;
 783       *dst++ = c;
 784     }
 785   return i;
 786 }
 787
 788
 789 static EMACS_INT
 790 string_count_byte8 (Lisp_Object string)
 791 {
 792   int multibyte = STRING_MULTIBYTE (string);
 793   EMACS_INT nbytes = SBYTES (string);
 794   unsigned char *p = SDATA (string);
 795   unsigned char *pend = p + nbytes;
 796   EMACS_INT count = 0;
 797   int c, len;
 798
 799   if (multibyte)
 800     while (p < pend)
 801       {
 802         c = *p;
 803         len = BYTES_BY_CHAR_HEAD (c);
 804
 805         if (CHAR_BYTE8_HEAD_P (c))
 806           count++;
 807         p += len;
 808       }
 809   else
 810     while (p < pend)
 811       {
 812         if (*p++ >= 0x80)
 813           count++;
 814       }
 815   return count;
 816 }
 817
 818
 819 Lisp_Object
 820 string_escape_byte8 (Lisp_Object string)
 821 {
 822   EMACS_INT nchars = SCHARS (string);
 823   EMACS_INT nbytes = SBYTES (string);
 824   int multibyte = STRING_MULTIBYTE (string);
 825   EMACS_INT byte8_count;
 826   const unsigned char *src, *src_end;
 827   unsigned char *dst;
 828   Lisp_Object val;
 829   int c, len;
 830
 831   if (multibyte && nchars == nbytes)
 832     return string;
 833
 834   byte8_count = string_count_byte8 (string);
 835
 836   if (byte8_count == 0)
 837     return string;
 838
 839   if (multibyte)
 840     {
 841       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 842           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 843         string_overflow ();
 844
 845       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 846       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 847                                           nbytes + byte8_count * 2);
 848     }
 849   else
 850     {
 851       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 852         string_overflow ();
 853
 854       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 855       val = make_uninit_string (nbytes + byte8_count * 3);
 856     }
 857
 858   src = SDATA (string);
 859   src_end = src + nbytes;
 860   dst = SDATA (val);
 861   if (multibyte)
 862     while (src < src_end)
 863       {
 864         c = *src;
 865         len = BYTES_BY_CHAR_HEAD (c);
 866
 867         if (CHAR_BYTE8_HEAD_P (c))
 868           {
 869             c = STRING_CHAR_ADVANCE (src);
 870             c = CHAR_TO_BYTE8 (c);
 871             sprintf ((char *) dst, "\\%03o", c);
 872             dst += 4;
 873           }
 874         else
 875           while (len--) *dst++ = *src++;
 876       }
 877   else
 878     while (src < src_end)
 879       {
 880         c = *src++;
 881         if (c >= 0x80)
 882           {
 883             sprintf ((char *) dst, "\\%03o", c);
 884             dst += 4;
 885           }
 886         else
 887           *dst++ = c;
 888       }
 889   return val;
 890 }
 891
 892 \f
 893 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 894        doc: /*
 895 Concatenate all the argument characters and make the result a string.
 896 usage: (string &rest CHARACTERS)  */)
 897   (ptrdiff_t n, Lisp_Object *args)
 898 {
 899   ptrdiff_t i;
 900   int c;
 901   unsigned char *buf, *p;
 902   Lisp_Object str;
 903   USE_SAFE_ALLOCA;
 904
 905   SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
 906   p = buf;
 907
 908   for (i = 0; i < n; i++)
 909     {
 910       CHECK_CHARACTER (args[i]);
 911       c = XINT (args[i]);
 912       p += CHAR_STRING (c, p);
 913     }
 914
 915   str = make_string_from_bytes ((char *) buf, n, p - buf);
 916   SAFE_FREE ();
 917   return str;
 918 }
 919
 920 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 921        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 922 usage: (unibyte-string &rest BYTES)  */)
 923   (ptrdiff_t n, Lisp_Object *args)
 924 {
 925   ptrdiff_t i;
 926   int c;
 927   unsigned char *buf, *p;
 928   Lisp_Object str;
 929   USE_SAFE_ALLOCA;
 930
 931   SAFE_ALLOCA (buf, unsigned char *, n);
 932   p = buf;
 933
 934   for (i = 0; i < n; i++)
 935     {
 936       CHECK_NATNUM (args[i]);
 937       c = XFASTINT (args[i]);
 938       if (c >= 256)
 939         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 940       *p++ = c;
 941     }
 942
 943   str = make_string_from_bytes ((char *) buf, n, p - buf);
 944   SAFE_FREE ();
 945   return str;
 946 }
 947
 948 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 949        Schar_resolve_modifiers, 1, 1, 0,
 950        doc: /* Resolve modifiers in the character CHAR.
 951 The value is a character with modifiers resolved into the character
 952 code.  Unresolved modifiers are kept in the value.
 953 usage: (char-resolve-modifiers CHAR)  */)
 954   (Lisp_Object character)
 955 {
 956   int c;
 957
 958   CHECK_NUMBER (character);
 959   c = XINT (character);
 960   return make_number (char_resolve_modifier_mask (c));
 961 }
 962
 963 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 964        doc: /* Return a byte value of a character at point.
 965 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 966 a byte value.
 967 Optional 2nd arg STRING, if non-nil, is a string of which first
 968 character is a target to get a byte value.  In this case, POSITION, if
 969 non-nil, is an index of a target character in the string.
 970
 971 If the current buffer (or STRING) is multibyte, and the target
 972 character is not ASCII nor 8-bit character, an error is signalled.  */)
 973   (Lisp_Object position, Lisp_Object string)
 974 {
 975   int c;
 976   EMACS_INT pos;
 977   unsigned char *p;
 978
 979   if (NILP (string))
 980     {
 981       if (NILP (position))
 982         {
 983           p = PT_ADDR;
 984         }
 985       else
 986         {
 987           CHECK_NUMBER_COERCE_MARKER (position);
 988           if (XINT (position) < BEGV || XINT (position) >= ZV)
 989             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 990           pos = XFASTINT (position);
 991           p = CHAR_POS_ADDR (pos);
 992         }
 993       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 994         return make_number (*p);
 995     }
 996   else
 997     {
 998       CHECK_STRING (string);
 999       if (NILP (position))
1000         {
1001           p = SDATA (string);
1002         }
1003       else
1004         {
1005           CHECK_NATNUM (position);
1006           if (XINT (position) >= SCHARS (string))
1007             args_out_of_range (string, position);
1008           pos = XFASTINT (position);
1009           p = SDATA (string) + string_char_to_byte (string, pos);
1010         }
1011       if (! STRING_MULTIBYTE (string))
1012         return make_number (*p);
1013     }
1014   c = STRING_CHAR (p);
1015   if (CHAR_BYTE8_P (c))
1016     c = CHAR_TO_BYTE8 (c);
1017   else if (! ASCII_CHAR_P (c))
1018     error ("Not an ASCII nor an 8-bit character: %d", c);
1019   return make_number (c);
1020 }
1021
1022
1023 void
1024 init_character_once (void)
1025 {
1026 }
1027
1028 #ifdef emacs
1029
1030 void
1031 syms_of_character (void)
1032 {
1033   DEFSYM (Qcharacterp, "characterp");
1034   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1035
1036   staticpro (&Vchar_unify_table);
1037   Vchar_unify_table = Qnil;
1038
1039   defsubr (&Smax_char);
1040   defsubr (&Scharacterp);
1041   defsubr (&Sunibyte_char_to_multibyte);
1042   defsubr (&Smultibyte_char_to_unibyte);
1043   defsubr (&Schar_width);
1044   defsubr (&Sstring_width);
1045   defsubr (&Sstring);
1046   defsubr (&Sunibyte_string);
1047   defsubr (&Schar_resolve_modifiers);
1048   defsubr (&Sget_byte);
1049
1050   DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
1051                doc: /*
1052 Vector recording all translation tables ever defined.
1053 Each element is a pair (SYMBOL . TABLE) relating the table to the
1054 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1055   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1056
1057   DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
1058                doc: /*
1059 A char-table for characters which invoke auto-filling.
1060 Such characters have value t in this table.  */);
1061   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1062   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1063   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1064
1065   DEFVAR_LISP ("char-width-table", Vchar_width_table,
1066                doc: /*
1067 A char-table for width (columns) of each character.  */);
1068   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1069   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1070   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1071                         make_number (4));
1072
1073   DEFVAR_LISP ("printable-chars", Vprintable_chars,
1074                doc: /* A char-table for each printable character.  */);
1075   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1076   Fset_char_table_range (Vprintable_chars,
1077                          Fcons (make_number (32), make_number (126)), Qt);
1078   Fset_char_table_range (Vprintable_chars,
1079                          Fcons (make_number (160),
1080                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1081
1082   DEFVAR_LISP ("char-script-table", Vchar_script_table,
1083                doc: /* Char table of script symbols.
1084 It has one extra slot whose value is a list of script symbols.  */);
1085
1086   /* Intern this now in case it isn't already done.
1087      Setting this variable twice is harmless.
1088      But don't staticpro it here--that is done in alloc.c.  */
1089   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
1090   DEFSYM (Qchar_script_table, "char-script-table");
1091   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1092   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1093
1094   DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
1095                doc: /* Alist of scripts vs the representative characters.
1096 Each element is a cons (SCRIPT . CHARS).
1097 SCRIPT is a symbol representing a script or a subgroup of a script.
1098 CHARS is a list or a vector of characters.
1099 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1100 If it is a vector, one of the characters in the vector is necessary.
1101 This variable is used to find a font for a specific script.  */);
1102   Vscript_representative_chars = Qnil;
1103
1104   DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
1105                doc: /* Char table of Unicode's "General Category".
1106 All Unicode characters have one of the following values (symbol):
1107   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1108   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1109 See The Unicode Standard for the meaning of those values.  */);
1110   /* The correct char-table is setup in characters.el.  */
1111   Vunicode_category_table = Qnil;
1112 }
1113
1114 #endif /* emacs */