src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2013 Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
/* First read the documentation in `character.h'; it is needed to
   understand the code in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <intprops.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
/* The symbol `characterp'; set up by DEFSYM in syms_of_character.  */
Lisp_Object Qcharacterp;

/* The symbol `auto-fill-chars'; set up by DEFSYM in syms_of_character
   and passed there to Fmake_char_table when creating the
   `auto-fill-chars' char-table.  */
static Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
Lisp_Object Vchar_unify_table;

/* The symbol `char-script-table'; set up by DEFSYM in
   syms_of_character, where it is given one char-table extra slot and
   passed to Fmake_char_table when creating `char-script-table'.  */
static Lisp_Object Qchar_script_table;
  60
  61 \f
  62
  63 /* If character code C has modifier masks, reflect them to the
  64    character code if possible.  Return the resulting code.  */
  65
  66 EMACS_INT
  67 char_resolve_modifier_mask (EMACS_INT c)
  68 {
  69   /* A non-ASCII character can't reflect modifier bits to the code.  */
  70   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  71     return c;
  72
  73   /* For Meta, Shift, and Control modifiers, we need special care.  */
  74   if (c & CHAR_SHIFT)
  75     {
  76       /* Shift modifier is valid only with [A-Za-z].  */
  77       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  78         c &= ~CHAR_SHIFT;
  79       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  80         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  81       /* Shift modifier for control characters and SPC is ignored.  */
  82       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  83         c &= ~CHAR_SHIFT;
  84     }
  85   if (c & CHAR_CTL)
  86     {
  87       /* Simulate the code in lread.c.  */
  88       /* Allow `\C- ' and `\C-?'.  */
  89       if ((c & 0377) == ' ')
  90         c &= ~0177 & ~ CHAR_CTL;
  91       else if ((c & 0377) == '?')
  92         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  93       /* ASCII control chars are made from letters (both cases),
  94          as well as the non-letters within 0100...0137.  */
  95       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  96         c &= (037 | (~0177 & ~CHAR_CTL));
  97       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
  98         c &= (037 | (~0177 & ~CHAR_CTL));
  99     }
 100 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 101   if (c & CHAR_META)
 102     {
 103       /* Move the meta bit to the right place for a string.  */
 104       c = (c & ~CHAR_META) | 0x80;
 105     }
 106 #endif
 107
 108   return c;
 109 }
 110
 111
 112 /* Store multibyte form of character C at P.  If C has modifier bits,
 113    handle them appropriately.  */
 114
 115 int
 116 char_string (unsigned int c, unsigned char *p)
 117 {
 118   int bytes;
 119
 120   if (c & CHAR_MODIFIER_MASK)
 121     {
 122       c = char_resolve_modifier_mask (c);
 123       /* If C still has any modifier bits, just ignore it.  */
 124       c &= ~CHAR_MODIFIER_MASK;
 125     }
 126
 127   if (c <= MAX_3_BYTE_CHAR)
 128     {
 129       bytes = CHAR_STRING (c, p);
 130     }
 131   else if (c <= MAX_4_BYTE_CHAR)
 132     {
 133       p[0] = (0xF0 | (c >> 18));
 134       p[1] = (0x80 | ((c >> 12) & 0x3F));
 135       p[2] = (0x80 | ((c >> 6) & 0x3F));
 136       p[3] = (0x80 | (c & 0x3F));
 137       bytes = 4;
 138     }
 139   else if (c <= MAX_5_BYTE_CHAR)
 140     {
 141       p[0] = 0xF8;
 142       p[1] = (0x80 | ((c >> 18) & 0x0F));
 143       p[2] = (0x80 | ((c >> 12) & 0x3F));
 144       p[3] = (0x80 | ((c >> 6) & 0x3F));
 145       p[4] = (0x80 | (c & 0x3F));
 146       bytes = 5;
 147     }
 148   else if (c <= MAX_CHAR)
 149     {
 150       c = CHAR_TO_BYTE8 (c);
 151       bytes = BYTE8_STRING (c, p);
 152     }
 153   else
 154     error ("Invalid character: %x", c);
 155
 156   return bytes;
 157 }
 158
 159
 160 /* Return a character whose multibyte form is at P.  If LEN is not
 161    NULL, it must be a pointer to integer.  In that case, set *LEN to
 162    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 163    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 164    the ending address (i.e., the starting address of the next
 165    character) of the multibyte form.  */
 166
 167 int
 168 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 169 {
 170   int c;
 171   const unsigned char *saved_p = p;
 172
 173   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 174     {
 175       /* 1-, 2-, and 3-byte sequences can be handled by the macro.  */
 176       c = STRING_CHAR_ADVANCE (p);
 177     }
 178   else if (! (*p & 0x08))
 179     {
 180       /* A 4-byte sequence of this form:
 181          11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  */
 182       c = ((((p)[0] & 0x7) << 18)
 183            | (((p)[1] & 0x3F) << 12)
 184            | (((p)[2] & 0x3F) << 6)
 185            | ((p)[3] & 0x3F));
 186       p += 4;
 187     }
 188   else
 189     {
 190       /* A 5-byte sequence of this form:
 191
 192          111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 193
 194          Note that the top 4 `x's are always 0, so shifting p[1] can
 195          never exceed the maximum valid character codepoint. */
 196       c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
 197            (((p)[1] & 0x3F) << 18)
 198            | (((p)[2] & 0x3F) << 12)
 199            | (((p)[3] & 0x3F) << 6)
 200            | ((p)[4] & 0x3F));
 201       p += 5;
 202     }
 203
 204   if (len)
 205     *len = p - saved_p;
 206   if (advanced)
 207     *advanced = p;
 208   return c;
 209 }
 210
 211
 212 /* Translate character C by translation table TABLE.  If no translation is
 213    found in TABLE, return the untranslated character.  If TABLE is a list,
 214    elements are char tables.  In that case, recursively translate C by all the
 215    tables in the list.  */
 216
 217 int
 218 translate_char (Lisp_Object table, int c)
 219 {
 220   if (CHAR_TABLE_P (table))
 221     {
 222       Lisp_Object ch;
 223
 224       ch = CHAR_TABLE_REF (table, c);
 225       if (CHARACTERP (ch))
 226         c = XINT (ch);
 227     }
 228   else
 229     {
 230       for (; CONSP (table); table = XCDR (table))
 231         c = translate_char (XCAR (table), c);
 232     }
 233   return c;
 234 }
 235
 236 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 237    them, return (C & 0xFF).  */
 238
 239 int
 240 multibyte_char_to_unibyte (int c)
 241 {
 242   if (c < 0x80)
 243     return c;
 244   if (CHAR_BYTE8_P (c))
 245     return CHAR_TO_BYTE8 (c);
 246   return (c & 0xFF);
 247 }
 248
 249 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 250    by charset_unibyte.  */
 251
 252 int
 253 multibyte_char_to_unibyte_safe (int c)
 254 {
 255   if (c < 0x80)
 256     return c;
 257   if (CHAR_BYTE8_P (c))
 258     return CHAR_TO_BYTE8 (c);
 259   return -1;
 260 }
 261
 262 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 263        doc: /* Return non-nil if OBJECT is a character.
 264 In Emacs Lisp, characters are represented by character codes, which
 265 are non-negative integers.  The function `max-char' returns the
 266 maximum character code.
 267 usage: (characterp OBJECT)  */)
 268   (Lisp_Object object, Lisp_Object ignore)
 269 {
 270   return (CHARACTERP (object) ? Qt : Qnil);
 271 }
 272
 273 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 274        doc: /* Return the character of the maximum code.  */)
 275   (void)
 276 {
 277   return make_number (MAX_CHAR);
 278 }
 279
 280 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 281        Sunibyte_char_to_multibyte, 1, 1, 0,
 282        doc: /* Convert the byte CH to multibyte character.  */)
 283   (Lisp_Object ch)
 284 {
 285   int c;
 286
 287   CHECK_CHARACTER (ch);
 288   c = XFASTINT (ch);
 289   if (c >= 0x100)
 290     error ("Not a unibyte character: %d", c);
 291   MAKE_CHAR_MULTIBYTE (c);
 292   return make_number (c);
 293 }
 294
 295 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 296        Smultibyte_char_to_unibyte, 1, 1, 0,
 297        doc: /* Convert the multibyte character CH to a byte.
 298 If the multibyte character does not represent a byte, return -1.  */)
 299   (Lisp_Object ch)
 300 {
 301   int cm;
 302
 303   CHECK_CHARACTER (ch);
 304   cm = XFASTINT (ch);
 305   if (cm < 256)
 306     /* Can't distinguish a byte read from a unibyte buffer from
 307        a latin1 char, so let's let it slide.  */
 308     return ch;
 309   else
 310     {
 311       int cu = CHAR_TO_BYTE_SAFE (cm);
 312       return make_number (cu);
 313     }
 314 }
 315
 316
 317 /* Return width (columns) of C considering the buffer display table DP. */
 318
 319 static ptrdiff_t
 320 char_width (int c, struct Lisp_Char_Table *dp)
 321 {
 322   ptrdiff_t width = CHAR_WIDTH (c);
 323
 324   if (dp)
 325     {
 326       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 327       int i;
 328
 329       if (VECTORP (disp))
 330         for (i = 0, width = 0; i < ASIZE (disp); i++)
 331           {
 332             ch = AREF (disp, i);
 333             if (CHARACTERP (ch))
 334               {
 335                 int w = CHAR_WIDTH (XFASTINT (ch));
 336                 if (INT_ADD_OVERFLOW (width, w))
 337                   string_overflow ();
 338                 width += w;
 339               }
 340           }
 341     }
 342   return width;
 343 }
 344
 345
 346 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 347        doc: /* Return width of CHAR when displayed in the current buffer.
 348 The width is measured by how many columns it occupies on the screen.
 349 Tab is taken to occupy `tab-width' columns.
 350 usage: (char-width CHAR)  */)
 351   (Lisp_Object ch)
 352 {
 353   int c;
 354   ptrdiff_t width;
 355
 356   CHECK_CHARACTER (ch);
 357   c = XINT (ch);
 358   width = char_width (c, buffer_display_table ());
 359   return make_number (width);
 360 }
 361
 362 /* Return width of string STR of length LEN when displayed in the
 363    current buffer.  The width is measured by how many columns it
 364    occupies on the screen.  If PRECISION > 0, return the width of
 365    longest substring that doesn't exceed PRECISION, and set number of
 366    characters and bytes of the substring in *NCHARS and *NBYTES
 367    respectively.  */
 368
 369 ptrdiff_t
 370 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 371                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 372 {
 373   ptrdiff_t i = 0, i_byte = 0;
 374   ptrdiff_t width = 0;
 375   struct Lisp_Char_Table *dp = buffer_display_table ();
 376
 377   while (i_byte < len)
 378     {
 379       int bytes;
 380       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 381       ptrdiff_t thiswidth = char_width (c, dp);
 382
 383       if (precision <= 0)
 384         {
 385           if (INT_ADD_OVERFLOW (width, thiswidth))
 386             string_overflow ();
 387         }
 388       else if (precision - width < thiswidth)
 389         {
 390           *nchars = i;
 391           *nbytes = i_byte;
 392           return width;
 393         }
 394       i++;
 395       i_byte += bytes;
 396       width += thiswidth;
 397   }
 398
 399   if (precision > 0)
 400     {
 401       *nchars = i;
 402       *nbytes = i_byte;
 403     }
 404
 405   return width;
 406 }
 407
 408 /* Return width of string STR of length LEN when displayed in the
 409    current buffer.  The width is measured by how many columns it
 410    occupies on the screen.  */
 411
 412 ptrdiff_t
 413 strwidth (const char *str, ptrdiff_t len)
 414 {
 415   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 416 }
 417
 418 /* Return width of Lisp string STRING when displayed in the current
 419    buffer.  The width is measured by how many columns it occupies on
 420    the screen while paying attention to compositions.  If PRECISION >
 421    0, return the width of longest substring that doesn't exceed
 422    PRECISION, and set number of characters and bytes of the substring
 423    in *NCHARS and *NBYTES respectively.  */
 424
 425 ptrdiff_t
 426 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 427                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 428 {
 429   ptrdiff_t len = SCHARS (string);
 430   /* This set multibyte to 0 even if STRING is multibyte when it
 431      contains only ascii and eight-bit-graphic, but that's
 432      intentional.  */
 433   bool multibyte = len < SBYTES (string);
 434   unsigned char *str = SDATA (string);
 435   ptrdiff_t i = 0, i_byte = 0;
 436   ptrdiff_t width = 0;
 437   struct Lisp_Char_Table *dp = buffer_display_table ();
 438
 439   while (i < len)
 440     {
 441       ptrdiff_t chars, bytes, thiswidth;
 442       Lisp_Object val;
 443       ptrdiff_t cmp_id;
 444       ptrdiff_t ignore, end;
 445
 446       if (find_composition (i, -1, &ignore, &end, &val, string)
 447           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 448               >= 0))
 449         {
 450           thiswidth = composition_table[cmp_id]->width;
 451           chars = end - i;
 452           bytes = string_char_to_byte (string, end) - i_byte;
 453         }
 454       else
 455         {
 456           int c;
 457
 458           if (multibyte)
 459             {
 460               int cbytes;
 461               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 462               bytes = cbytes;
 463             }
 464           else
 465             c = str[i_byte], bytes = 1;
 466           chars = 1;
 467           thiswidth = char_width (c, dp);
 468         }
 469
 470       if (precision <= 0)
 471         {
 472 #ifdef emacs
 473           if (INT_ADD_OVERFLOW (width, thiswidth))
 474             string_overflow ();
 475 #endif
 476         }
 477       else if (precision - width < thiswidth)
 478         {
 479           *nchars = i;
 480           *nbytes = i_byte;
 481           return width;
 482         }
 483       i += chars;
 484       i_byte += bytes;
 485       width += thiswidth;
 486     }
 487
 488   if (precision > 0)
 489     {
 490       *nchars = i;
 491       *nbytes = i_byte;
 492     }
 493
 494   return width;
 495 }
 496
 497 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 498        doc: /* Return width of STRING when displayed in the current buffer.
 499 Width is measured by how many columns it occupies on the screen.
 500 When calculating width of a multibyte character in STRING,
 501 only the base leading-code is considered; the validity of
 502 the following bytes is not checked.  Tabs in STRING are always
 503 taken to occupy `tab-width' columns.
 504 usage: (string-width STRING)  */)
 505   (Lisp_Object str)
 506 {
 507   Lisp_Object val;
 508
 509   CHECK_STRING (str);
 510   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 511   return val;
 512 }
 513
 514 /* Return the number of characters in the NBYTES bytes at PTR.
 515    This works by looking at the contents and checking for multibyte
 516    sequences while assuming that there's no invalid sequence.
 517    However, if the current buffer has enable-multibyte-characters =
 518    nil, we treat each byte as a character.  */
 519
 520 ptrdiff_t
 521 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 522 {
 523   /* current_buffer is null at early stages of Emacs initialization.  */
 524   if (current_buffer == 0
 525       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 526     return nbytes;
 527
 528   return multibyte_chars_in_text (ptr, nbytes);
 529 }
 530
 531 /* Return the number of characters in the NBYTES bytes at PTR.
 532    This works by looking at the contents and checking for multibyte
 533    sequences while assuming that there's no invalid sequence.  It
 534    ignores enable-multibyte-characters.  */
 535
 536 ptrdiff_t
 537 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 538 {
 539   const unsigned char *endp = ptr + nbytes;
 540   ptrdiff_t chars = 0;
 541
 542   while (ptr < endp)
 543     {
 544       int len = MULTIBYTE_LENGTH (ptr, endp);
 545
 546       if (len == 0)
 547         emacs_abort ();
 548       ptr += len;
 549       chars++;
 550     }
 551
 552   return chars;
 553 }
 554
 555 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 556    characters and bytes in it, and store them in *NCHARS and *NBYTES
 557    respectively.  On counting bytes, pay attention to that 8-bit
 558    characters not constructing a valid multibyte sequence are
 559    represented by 2-byte in a multibyte text.  */
 560
 561 void
 562 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 563                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 564 {
 565   const unsigned char *endp = str + len;
 566   int n;
 567   ptrdiff_t chars = 0, bytes = 0;
 568
 569   if (len >= MAX_MULTIBYTE_LENGTH)
 570     {
 571       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 572       while (str < adjusted_endp)
 573         {
 574           if (! CHAR_BYTE8_HEAD_P (*str)
 575               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 576             str += n, bytes += n;
 577           else
 578             str++, bytes += 2;
 579           chars++;
 580         }
 581     }
 582   while (str < endp)
 583     {
 584       if (! CHAR_BYTE8_HEAD_P (*str)
 585           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 586         str += n, bytes += n;
 587       else
 588         str++, bytes += 2;
 589       chars++;
 590     }
 591
 592   *nchars = chars;
 593   *nbytes = bytes;
 594   return;
 595 }
 596
 597 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 598    It actually converts only such 8-bit characters that don't construct
 599    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 600    NCHARS is nonzero, set *NCHARS to the number of characters in the
 601    text.  It is assured that we can use LEN bytes at STR as a work
 602    area and that is enough.  Return the number of bytes of the
 603    resulting text.  */
 604
 605 ptrdiff_t
 606 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 607                   ptrdiff_t *nchars)
 608 {
 609   unsigned char *p = str, *endp = str + nbytes;
 610   unsigned char *to;
 611   ptrdiff_t chars = 0;
 612   int n;
 613
 614   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 615     {
 616       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 617       while (p < adjusted_endp
 618              && ! CHAR_BYTE8_HEAD_P (*p)
 619              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 620         p += n, chars++;
 621     }
 622   while (p < endp
 623          && ! CHAR_BYTE8_HEAD_P (*p)
 624          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 625     p += n, chars++;
 626   if (nchars)
 627     *nchars = chars;
 628   if (p == endp)
 629     return nbytes;
 630
 631   to = p;
 632   nbytes = endp - p;
 633   endp = str + len;
 634   memmove (endp - nbytes, p, nbytes);
 635   p = endp - nbytes;
 636
 637   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 638     {
 639       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 640       while (p < adjusted_endp)
 641         {
 642           if (! CHAR_BYTE8_HEAD_P (*p)
 643               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 644             {
 645               while (n--)
 646                 *to++ = *p++;
 647             }
 648           else
 649             {
 650               int c = *p++;
 651               c = BYTE8_TO_CHAR (c);
 652               to += CHAR_STRING (c, to);
 653             }
 654         }
 655       chars++;
 656     }
 657   while (p < endp)
 658     {
 659       if (! CHAR_BYTE8_HEAD_P (*p)
 660           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 661         {
 662           while (n--)
 663             *to++ = *p++;
 664         }
 665       else
 666         {
 667           int c = *p++;
 668           c = BYTE8_TO_CHAR (c);
 669           to += CHAR_STRING (c, to);
 670         }
 671       chars++;
 672     }
 673   if (nchars)
 674     *nchars = chars;
 675   return (to - str);
 676 }
 677
 678 /* Parse unibyte string at STR of LEN bytes, and return the number of
 679    bytes it may occupy when converted to multibyte string by
 680    `str_to_multibyte'.  */
 681
 682 ptrdiff_t
 683 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 684 {
 685   const unsigned char *endp = str + len;
 686   ptrdiff_t bytes;
 687
 688   for (bytes = 0; str < endp; str++)
 689     {
 690       int n = *str < 0x80 ? 1 : 2;
 691       if (INT_ADD_OVERFLOW (bytes, n))
 692         string_overflow ();
 693       bytes += n;
 694     }
 695   return bytes;
 696 }
 697
 698
 699 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 700    that contains the same single-byte characters.  It actually
 701    converts all 8-bit characters to multibyte forms.  It is assured
 702    that we can use LEN bytes at STR as a work area and that is
 703    enough.  */
 704
 705 ptrdiff_t
 706 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 707 {
 708   unsigned char *p = str, *endp = str + bytes;
 709   unsigned char *to;
 710
 711   while (p < endp && *p < 0x80) p++;
 712   if (p == endp)
 713     return bytes;
 714   to = p;
 715   bytes = endp - p;
 716   endp = str + len;
 717   memmove (endp - bytes, p, bytes);
 718   p = endp - bytes;
 719   while (p < endp)
 720     {
 721       int c = *p++;
 722
 723       if (c >= 0x80)
 724         c = BYTE8_TO_CHAR (c);
 725       to += CHAR_STRING (c, to);
 726     }
 727   return (to - str);
 728 }
 729
 730 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 731    actually converts characters in the range 0x80..0xFF to
 732    unibyte.  */
 733
 734 ptrdiff_t
 735 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 736 {
 737   const unsigned char *p = str, *endp = str + bytes;
 738   unsigned char *to;
 739   int c, len;
 740
 741   while (p < endp)
 742     {
 743       c = *p;
 744       len = BYTES_BY_CHAR_HEAD (c);
 745       if (CHAR_BYTE8_HEAD_P (c))
 746         break;
 747       p += len;
 748     }
 749   to = str + (p - str);
 750   while (p < endp)
 751     {
 752       c = *p;
 753       len = BYTES_BY_CHAR_HEAD (c);
 754       if (CHAR_BYTE8_HEAD_P (c))
 755         {
 756           c = STRING_CHAR_ADVANCE (p);
 757           *to++ = CHAR_TO_BYTE8 (c);
 758         }
 759       else
 760         {
 761           while (len--) *to++ = *p++;
 762         }
 763     }
 764   return (to - str);
 765 }
 766
 767 /* Convert eight-bit chars in SRC (in multibyte form) to the
 768    corresponding byte and store in DST.  CHARS is the number of
 769    characters in SRC.  The value is the number of bytes stored in DST.
 770    Usually, the value is the same as CHARS, but is less than it if SRC
 771    contains a non-ASCII, non-eight-bit character.  */
 772
 773 ptrdiff_t
 774 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
 775 {
 776   ptrdiff_t i;
 777
 778   for (i = 0; i < chars; i++)
 779     {
 780       int c = STRING_CHAR_ADVANCE (src);
 781
 782       if (CHAR_BYTE8_P (c))
 783         c = CHAR_TO_BYTE8 (c);
 784       else if (! ASCII_CHAR_P (c))
 785         return i;
 786       *dst++ = c;
 787     }
 788   return i;
 789 }
 790
 791
 792 static ptrdiff_t
 793 string_count_byte8 (Lisp_Object string)
 794 {
 795   bool multibyte = STRING_MULTIBYTE (string);
 796   ptrdiff_t nbytes = SBYTES (string);
 797   unsigned char *p = SDATA (string);
 798   unsigned char *pend = p + nbytes;
 799   ptrdiff_t count = 0;
 800   int c, len;
 801
 802   if (multibyte)
 803     while (p < pend)
 804       {
 805         c = *p;
 806         len = BYTES_BY_CHAR_HEAD (c);
 807
 808         if (CHAR_BYTE8_HEAD_P (c))
 809           count++;
 810         p += len;
 811       }
 812   else
 813     while (p < pend)
 814       {
 815         if (*p++ >= 0x80)
 816           count++;
 817       }
 818   return count;
 819 }
 820
 821
 822 Lisp_Object
 823 string_escape_byte8 (Lisp_Object string)
 824 {
 825   ptrdiff_t nchars = SCHARS (string);
 826   ptrdiff_t nbytes = SBYTES (string);
 827   bool multibyte = STRING_MULTIBYTE (string);
 828   ptrdiff_t byte8_count;
 829   const unsigned char *src, *src_end;
 830   unsigned char *dst;
 831   Lisp_Object val;
 832   int c, len;
 833
 834   if (multibyte && nchars == nbytes)
 835     return string;
 836
 837   byte8_count = string_count_byte8 (string);
 838
 839   if (byte8_count == 0)
 840     return string;
 841
 842   if (multibyte)
 843     {
 844       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 845           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 846         string_overflow ();
 847
 848       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 849       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 850                                           nbytes + byte8_count * 2);
 851     }
 852   else
 853     {
 854       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 855         string_overflow ();
 856
 857       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 858       val = make_uninit_string (nbytes + byte8_count * 3);
 859     }
 860
 861   src = SDATA (string);
 862   src_end = src + nbytes;
 863   dst = SDATA (val);
 864   if (multibyte)
 865     while (src < src_end)
 866       {
 867         c = *src;
 868         len = BYTES_BY_CHAR_HEAD (c);
 869
 870         if (CHAR_BYTE8_HEAD_P (c))
 871           {
 872             c = STRING_CHAR_ADVANCE (src);
 873             c = CHAR_TO_BYTE8 (c);
 874             dst += sprintf ((char *) dst, "\\%03o", c);
 875           }
 876         else
 877           while (len--) *dst++ = *src++;
 878       }
 879   else
 880     while (src < src_end)
 881       {
 882         c = *src++;
 883         if (c >= 0x80)
 884           dst += sprintf ((char *) dst, "\\%03o", c);
 885         else
 886           *dst++ = c;
 887       }
 888   return val;
 889 }
 890
 891 \f
 892 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 893        doc: /*
 894 Concatenate all the argument characters and make the result a string.
 895 usage: (string &rest CHARACTERS)  */)
 896   (ptrdiff_t n, Lisp_Object *args)
 897 {
 898   ptrdiff_t i;
 899   int c;
 900   unsigned char *buf, *p;
 901   Lisp_Object str;
 902   USE_SAFE_ALLOCA;
 903
 904   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 905   p = buf;
 906
 907   for (i = 0; i < n; i++)
 908     {
 909       CHECK_CHARACTER (args[i]);
 910       c = XINT (args[i]);
 911       p += CHAR_STRING (c, p);
 912     }
 913
 914   str = make_string_from_bytes ((char *) buf, n, p - buf);
 915   SAFE_FREE ();
 916   return str;
 917 }
 918
 919 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 920        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 921 usage: (unibyte-string &rest BYTES)  */)
 922   (ptrdiff_t n, Lisp_Object *args)
 923 {
 924   ptrdiff_t i;
 925   Lisp_Object str;
 926   USE_SAFE_ALLOCA;
 927   unsigned char *buf = SAFE_ALLOCA (n);
 928   unsigned char *p = buf;
 929
 930   for (i = 0; i < n; i++)
 931     {
 932       CHECK_RANGED_INTEGER (args[i], 0, 255);
 933       *p++ = XINT (args[i]);
 934     }
 935
 936   str = make_string_from_bytes ((char *) buf, n, p - buf);
 937   SAFE_FREE ();
 938   return str;
 939 }
 940
 941 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 942        Schar_resolve_modifiers, 1, 1, 0,
 943        doc: /* Resolve modifiers in the character CHAR.
 944 The value is a character with modifiers resolved into the character
 945 code.  Unresolved modifiers are kept in the value.
 946 usage: (char-resolve-modifiers CHAR)  */)
 947   (Lisp_Object character)
 948 {
 949   EMACS_INT c;
 950
 951   CHECK_NUMBER (character);
 952   c = XINT (character);
 953   return make_number (char_resolve_modifier_mask (c));
 954 }
 955
 956 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 957        doc: /* Return a byte value of a character at point.
 958 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 959 a byte value.
 960 Optional 2nd arg STRING, if non-nil, is a string of which first
 961 character is a target to get a byte value.  In this case, POSITION, if
 962 non-nil, is an index of a target character in the string.
 963
 964 If the current buffer (or STRING) is multibyte, and the target
 965 character is not ASCII nor 8-bit character, an error is signaled.  */)
 966   (Lisp_Object position, Lisp_Object string)
 967 {
 968   int c;
 969   ptrdiff_t pos;
 970   unsigned char *p;
 971
 972   if (NILP (string))
 973     {
 974       if (NILP (position))
 975         {
 976           p = PT_ADDR;
 977         }
 978       else
 979         {
 980           CHECK_NUMBER_COERCE_MARKER (position);
 981           if (XINT (position) < BEGV || XINT (position) >= ZV)
 982             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 983           pos = XFASTINT (position);
 984           p = CHAR_POS_ADDR (pos);
 985         }
 986       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 987         return make_number (*p);
 988     }
 989   else
 990     {
 991       CHECK_STRING (string);
 992       if (NILP (position))
 993         {
 994           p = SDATA (string);
 995         }
 996       else
 997         {
 998           CHECK_NATNUM (position);
 999           if (XINT (position) >= SCHARS (string))
1000             args_out_of_range (string, position);
1001           pos = XFASTINT (position);
1002           p = SDATA (string) + string_char_to_byte (string, pos);
1003         }
1004       if (! STRING_MULTIBYTE (string))
1005         return make_number (*p);
1006     }
1007   c = STRING_CHAR (p);
1008   if (CHAR_BYTE8_P (c))
1009     c = CHAR_TO_BYTE8 (c);
1010   else if (! ASCII_CHAR_P (c))
1011     error ("Not an ASCII nor an 8-bit character: %d", c);
1012   return make_number (c);
1013 }
1014
1015 #ifdef emacs
1016
/* Set up the Lisp-visible pieces of this file: define its symbols,
   register its primitives with defsubr, and define and initialize
   its Lisp variables.  */

void
syms_of_character (void)
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is not defined with DEFVAR_LISP here, so it is
     registered with staticpro explicitly.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with 16 slots, all nil.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only space and newline are marked.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* The default width is 1; codes 0x80..0x9F and every code above
     MAX_5_BYTE_CHAR get width 4.  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("printable-chars", Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* Mark codes 32..126 and 160..MAX_5_BYTE_CHAR with t; everything
     else keeps the default nil.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* The char-table-extra-slots property is put on the symbol before
     the table is created with that symbol as its purpose.  */
  DEFSYM (Qchar_script_table, "char-script-table");
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS).
SCRIPT is a symbol representing a script or a subgroup of a script.
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is setup in characters.el.  */
  Vunicode_category_table = Qnil;
}
1096
1097 #endif /* emacs */