src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2012  Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #define CHARACTER_INLINE EXTERN_INLINE
  33
  34 #include <stdio.h>
  35
  36 #ifdef emacs
  37
  38 #include <sys/types.h>
  39 #include <intprops.h>
  40 #include "lisp.h"
  41 #include "character.h"
  42 #include "buffer.h"
  43 #include "charset.h"
  44 #include "composite.h"
  45 #include "disptab.h"
  46
  47 #else  /* not emacs */
  48
  49 #include "mulelib.h"
  50
  51 #endif /* emacs */
  52
  53 Lisp_Object Qcharacterp;
  54
  55 static Lisp_Object Qauto_fill_chars;
  56
  57 /* Char-table of information about which character to unify to which
  58    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  59 Lisp_Object Vchar_unify_table;
  60
  61 static Lisp_Object Qchar_script_table;
  62
  63 \f
  64
  65 /* If character code C has modifier masks, reflect them to the
  66    character code if possible.  Return the resulting code.  */
  67
  68 EMACS_INT
  69 char_resolve_modifier_mask (EMACS_INT c)
  70 {
  71   /* A non-ASCII character can't reflect modifier bits to the code.  */
  72   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  73     return c;
  74
  75   /* For Meta, Shift, and Control modifiers, we need special care.  */
  76   if (c & CHAR_SHIFT)
  77     {
  78       /* Shift modifier is valid only with [A-Za-z].  */
  79       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  80         c &= ~CHAR_SHIFT;
  81       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  82         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  83       /* Shift modifier for control characters and SPC is ignored.  */
  84       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  85         c &= ~CHAR_SHIFT;
  86     }
  87   if (c & CHAR_CTL)
  88     {
  89       /* Simulate the code in lread.c.  */
  90       /* Allow `\C- ' and `\C-?'.  */
  91       if ((c & 0377) == ' ')
  92         c &= ~0177 & ~ CHAR_CTL;
  93       else if ((c & 0377) == '?')
  94         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  95       /* ASCII control chars are made from letters (both cases),
  96          as well as the non-letters within 0100...0137.  */
  97       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  98         c &= (037 | (~0177 & ~CHAR_CTL));
  99       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 100         c &= (037 | (~0177 & ~CHAR_CTL));
 101     }
 102 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 103   if (c & CHAR_META)
 104     {
 105       /* Move the meta bit to the right place for a string.  */
 106       c = (c & ~CHAR_META) | 0x80;
 107     }
 108 #endif
 109
 110   return c;
 111 }
 112
 113
 114 /* Store multibyte form of character C at P.  If C has modifier bits,
 115    handle them appropriately.  */
 116
 117 int
 118 char_string (unsigned int c, unsigned char *p)
 119 {
 120   int bytes;
 121
 122   if (c & CHAR_MODIFIER_MASK)
 123     {
 124       c = char_resolve_modifier_mask (c);
 125       /* If C still has any modifier bits, just ignore it.  */
 126       c &= ~CHAR_MODIFIER_MASK;
 127     }
 128
 129   MAYBE_UNIFY_CHAR (c);
 130
 131   if (c <= MAX_3_BYTE_CHAR)
 132     {
 133       bytes = CHAR_STRING (c, p);
 134     }
 135   else if (c <= MAX_4_BYTE_CHAR)
 136     {
 137       p[0] = (0xF0 | (c >> 18));
 138       p[1] = (0x80 | ((c >> 12) & 0x3F));
 139       p[2] = (0x80 | ((c >> 6) & 0x3F));
 140       p[3] = (0x80 | (c & 0x3F));
 141       bytes = 4;
 142     }
 143   else if (c <= MAX_5_BYTE_CHAR)
 144     {
 145       p[0] = 0xF8;
 146       p[1] = (0x80 | ((c >> 18) & 0x0F));
 147       p[2] = (0x80 | ((c >> 12) & 0x3F));
 148       p[3] = (0x80 | ((c >> 6) & 0x3F));
 149       p[4] = (0x80 | (c & 0x3F));
 150       bytes = 5;
 151     }
 152   else if (c <= MAX_CHAR)
 153     {
 154       c = CHAR_TO_BYTE8 (c);
 155       bytes = BYTE8_STRING (c, p);
 156     }
 157   else
 158     error ("Invalid character: %x", c);
 159
 160   return bytes;
 161 }
 162
 163
 164 /* Return a character whose multibyte form is at P.  If LEN is not
 165    NULL, it must be a pointer to integer.  In that case, set *LEN to
 166    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 167    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 168    the ending address (i.e., the starting address of the next
 169    character) of the multibyte form.  */
 170
 171 int
 172 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 173 {
 174   int c;
 175   const unsigned char *saved_p = p;
 176
 177   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 178     {
 179       c = STRING_CHAR_ADVANCE (p);
 180     }
 181   else if (! (*p & 0x08))
 182     {
 183       c = ((((p)[0] & 0xF) << 18)
 184            | (((p)[1] & 0x3F) << 12)
 185            | (((p)[2] & 0x3F) << 6)
 186            | ((p)[3] & 0x3F));
 187       p += 4;
 188     }
 189   else
 190     {
 191       c = ((((p)[1] & 0x3F) << 18)
 192            | (((p)[2] & 0x3F) << 12)
 193            | (((p)[3] & 0x3F) << 6)
 194            | ((p)[4] & 0x3F));
 195       p += 5;
 196     }
 197
 198   MAYBE_UNIFY_CHAR (c);
 199
 200   if (len)
 201     *len = p - saved_p;
 202   if (advanced)
 203     *advanced = p;
 204   return c;
 205 }
 206
 207
 208 /* Translate character C by translation table TABLE.  If no translation is
 209    found in TABLE, return the untranslated character.  If TABLE is a list,
 210    elements are char tables.  In that case, recursively translate C by all the
 211    tables in the list.  */
 212
 213 int
 214 translate_char (Lisp_Object table, int c)
 215 {
 216   if (CHAR_TABLE_P (table))
 217     {
 218       Lisp_Object ch;
 219
 220       ch = CHAR_TABLE_REF (table, c);
 221       if (CHARACTERP (ch))
 222         c = XINT (ch);
 223     }
 224   else
 225     {
 226       for (; CONSP (table); table = XCDR (table))
 227         c = translate_char (XCAR (table), c);
 228     }
 229   return c;
 230 }
 231
 232 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 233    them, return (C & 0xFF).  */
 234
 235 int
 236 multibyte_char_to_unibyte (int c)
 237 {
 238   if (c < 0x80)
 239     return c;
 240   if (CHAR_BYTE8_P (c))
 241     return CHAR_TO_BYTE8 (c);
 242   return (c & 0xFF);
 243 }
 244
 245 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 246    by charset_unibyte.  */
 247
 248 int
 249 multibyte_char_to_unibyte_safe (int c)
 250 {
 251   if (c < 0x80)
 252     return c;
 253   if (CHAR_BYTE8_P (c))
 254     return CHAR_TO_BYTE8 (c);
 255   return -1;
 256 }
 257
 258 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 259        doc: /* Return non-nil if OBJECT is a character.
 260 In Emacs Lisp, characters are represented by character codes, which
 261 are non-negative integers.  The function `max-char' returns the
 262 maximum character code.
 263 usage: (characterp OBJECT)  */)
 264   (Lisp_Object object, Lisp_Object ignore)
 265 {
 266   return (CHARACTERP (object) ? Qt : Qnil);
 267 }
 268
 269 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 270        doc: /* Return the character of the maximum code.  */)
 271   (void)
 272 {
 273   return make_number (MAX_CHAR);
 274 }
 275
 276 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 277        Sunibyte_char_to_multibyte, 1, 1, 0,
 278        doc: /* Convert the byte CH to multibyte character.  */)
 279   (Lisp_Object ch)
 280 {
 281   int c;
 282
 283   CHECK_CHARACTER (ch);
 284   c = XFASTINT (ch);
 285   if (c >= 0x100)
 286     error ("Not a unibyte character: %d", c);
 287   MAKE_CHAR_MULTIBYTE (c);
 288   return make_number (c);
 289 }
 290
 291 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 292        Smultibyte_char_to_unibyte, 1, 1, 0,
 293        doc: /* Convert the multibyte character CH to a byte.
 294 If the multibyte character does not represent a byte, return -1.  */)
 295   (Lisp_Object ch)
 296 {
 297   int cm;
 298
 299   CHECK_CHARACTER (ch);
 300   cm = XFASTINT (ch);
 301   if (cm < 256)
 302     /* Can't distinguish a byte read from a unibyte buffer from
 303        a latin1 char, so let's let it slide.  */
 304     return ch;
 305   else
 306     {
 307       int cu = CHAR_TO_BYTE_SAFE (cm);
 308       return make_number (cu);
 309     }
 310 }
 311
 312
 313 /* Return width (columns) of C considering the buffer display table DP. */
 314
 315 static ptrdiff_t
 316 char_width (int c, struct Lisp_Char_Table *dp)
 317 {
 318   ptrdiff_t width = CHAR_WIDTH (c);
 319
 320   if (dp)
 321     {
 322       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 323       int i;
 324
 325       if (VECTORP (disp))
 326         for (i = 0, width = 0; i < ASIZE (disp); i++)
 327           {
 328             ch = AREF (disp, i);
 329             if (CHARACTERP (ch))
 330               {
 331                 int w = CHAR_WIDTH (XFASTINT (ch));
 332                 if (INT_ADD_OVERFLOW (width, w))
 333                   string_overflow ();
 334                 width += w;
 335               }
 336           }
 337     }
 338   return width;
 339 }
 340
 341
 342 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 343        doc: /* Return width of CHAR when displayed in the current buffer.
 344 The width is measured by how many columns it occupies on the screen.
 345 Tab is taken to occupy `tab-width' columns.
 346 usage: (char-width CHAR)  */)
 347   (Lisp_Object ch)
 348 {
 349   int c;
 350   ptrdiff_t width;
 351
 352   CHECK_CHARACTER (ch);
 353   c = XINT (ch);
 354   width = char_width (c, buffer_display_table ());
 355   return make_number (width);
 356 }
 357
 358 /* Return width of string STR of length LEN when displayed in the
 359    current buffer.  The width is measured by how many columns it
 360    occupies on the screen.  If PRECISION > 0, return the width of
 361    longest substring that doesn't exceed PRECISION, and set number of
 362    characters and bytes of the substring in *NCHARS and *NBYTES
 363    respectively.  */
 364
 365 ptrdiff_t
 366 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 367                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 368 {
 369   ptrdiff_t i = 0, i_byte = 0;
 370   ptrdiff_t width = 0;
 371   struct Lisp_Char_Table *dp = buffer_display_table ();
 372
 373   while (i_byte < len)
 374     {
 375       int bytes;
 376       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 377       ptrdiff_t thiswidth = char_width (c, dp);
 378
 379       if (precision <= 0)
 380         {
 381           if (INT_ADD_OVERFLOW (width, thiswidth))
 382             string_overflow ();
 383         }
 384       else if (precision - width < thiswidth)
 385         {
 386           *nchars = i;
 387           *nbytes = i_byte;
 388           return width;
 389         }
 390       i++;
 391       i_byte += bytes;
 392       width += thiswidth;
 393   }
 394
 395   if (precision > 0)
 396     {
 397       *nchars = i;
 398       *nbytes = i_byte;
 399     }
 400
 401   return width;
 402 }
 403
 404 /* Return width of string STR of length LEN when displayed in the
 405    current buffer.  The width is measured by how many columns it
 406    occupies on the screen.  */
 407
 408 ptrdiff_t
 409 strwidth (const char *str, ptrdiff_t len)
 410 {
 411   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 412 }
 413
 414 /* Return width of Lisp string STRING when displayed in the current
 415    buffer.  The width is measured by how many columns it occupies on
 416    the screen while paying attention to compositions.  If PRECISION >
 417    0, return the width of longest substring that doesn't exceed
 418    PRECISION, and set number of characters and bytes of the substring
 419    in *NCHARS and *NBYTES respectively.  */
 420
 421 ptrdiff_t
 422 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 423                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 424 {
 425   ptrdiff_t len = SCHARS (string);
 426   /* This set multibyte to 0 even if STRING is multibyte when it
 427      contains only ascii and eight-bit-graphic, but that's
 428      intentional.  */
 429   bool multibyte = len < SBYTES (string);
 430   unsigned char *str = SDATA (string);
 431   ptrdiff_t i = 0, i_byte = 0;
 432   ptrdiff_t width = 0;
 433   struct Lisp_Char_Table *dp = buffer_display_table ();
 434
 435   while (i < len)
 436     {
 437       ptrdiff_t chars, bytes, thiswidth;
 438       Lisp_Object val;
 439       ptrdiff_t cmp_id;
 440       ptrdiff_t ignore, end;
 441
 442       if (find_composition (i, -1, &ignore, &end, &val, string)
 443           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 444               >= 0))
 445         {
 446           thiswidth = composition_table[cmp_id]->width;
 447           chars = end - i;
 448           bytes = string_char_to_byte (string, end) - i_byte;
 449         }
 450       else
 451         {
 452           int c;
 453
 454           if (multibyte)
 455             {
 456               int cbytes;
 457               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 458               bytes = cbytes;
 459             }
 460           else
 461             c = str[i_byte], bytes = 1;
 462           chars = 1;
 463           thiswidth = char_width (c, dp);
 464         }
 465
 466       if (precision <= 0)
 467         {
 468 #ifdef emacs
 469           if (INT_ADD_OVERFLOW (width, thiswidth))
 470             string_overflow ();
 471 #endif
 472         }
 473       else if (precision - width < thiswidth)
 474         {
 475           *nchars = i;
 476           *nbytes = i_byte;
 477           return width;
 478         }
 479       i += chars;
 480       i_byte += bytes;
 481       width += thiswidth;
 482     }
 483
 484   if (precision > 0)
 485     {
 486       *nchars = i;
 487       *nbytes = i_byte;
 488     }
 489
 490   return width;
 491 }
 492
 493 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 494        doc: /* Return width of STRING when displayed in the current buffer.
 495 Width is measured by how many columns it occupies on the screen.
 496 When calculating width of a multibyte character in STRING,
 497 only the base leading-code is considered; the validity of
 498 the following bytes is not checked.  Tabs in STRING are always
 499 taken to occupy `tab-width' columns.
 500 usage: (string-width STRING)  */)
 501   (Lisp_Object str)
 502 {
 503   Lisp_Object val;
 504
 505   CHECK_STRING (str);
 506   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 507   return val;
 508 }
 509
 510 /* Return the number of characters in the NBYTES bytes at PTR.
 511    This works by looking at the contents and checking for multibyte
 512    sequences while assuming that there's no invalid sequence.
 513    However, if the current buffer has enable-multibyte-characters =
 514    nil, we treat each byte as a character.  */
 515
 516 ptrdiff_t
 517 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 518 {
 519   /* current_buffer is null at early stages of Emacs initialization.  */
 520   if (current_buffer == 0
 521       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 522     return nbytes;
 523
 524   return multibyte_chars_in_text (ptr, nbytes);
 525 }
 526
 527 /* Return the number of characters in the NBYTES bytes at PTR.
 528    This works by looking at the contents and checking for multibyte
 529    sequences while assuming that there's no invalid sequence.  It
 530    ignores enable-multibyte-characters.  */
 531
 532 ptrdiff_t
 533 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 534 {
 535   const unsigned char *endp = ptr + nbytes;
 536   ptrdiff_t chars = 0;
 537
 538   while (ptr < endp)
 539     {
 540       int len = MULTIBYTE_LENGTH (ptr, endp);
 541
 542       if (len == 0)
 543         emacs_abort ();
 544       ptr += len;
 545       chars++;
 546     }
 547
 548   return chars;
 549 }
 550
 551 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 552    characters and bytes in it, and store them in *NCHARS and *NBYTES
 553    respectively.  On counting bytes, pay attention to that 8-bit
 554    characters not constructing a valid multibyte sequence are
 555    represented by 2-byte in a multibyte text.  */
 556
 557 void
 558 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 559                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 560 {
 561   const unsigned char *endp = str + len;
 562   int n;
 563   ptrdiff_t chars = 0, bytes = 0;
 564
 565   if (len >= MAX_MULTIBYTE_LENGTH)
 566     {
 567       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 568       while (str < adjusted_endp)
 569         {
 570           if (! CHAR_BYTE8_HEAD_P (*str)
 571               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 572             str += n, bytes += n;
 573           else
 574             str++, bytes += 2;
 575           chars++;
 576         }
 577     }
 578   while (str < endp)
 579     {
 580       if (! CHAR_BYTE8_HEAD_P (*str)
 581           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 582         str += n, bytes += n;
 583       else
 584         str++, bytes += 2;
 585       chars++;
 586     }
 587
 588   *nchars = chars;
 589   *nbytes = bytes;
 590   return;
 591 }
 592
 593 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 594    It actually converts only such 8-bit characters that don't construct
 595    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 596    NCHARS is nonzero, set *NCHARS to the number of characters in the
 597    text.  It is assured that we can use LEN bytes at STR as a work
 598    area and that is enough.  Return the number of bytes of the
 599    resulting text.  */
 600
 601 ptrdiff_t
 602 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 603                   ptrdiff_t *nchars)
 604 {
 605   unsigned char *p = str, *endp = str + nbytes;
 606   unsigned char *to;
 607   ptrdiff_t chars = 0;
 608   int n;
 609
 610   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 611     {
 612       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 613       while (p < adjusted_endp
 614              && ! CHAR_BYTE8_HEAD_P (*p)
 615              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 616         p += n, chars++;
 617     }
 618   while (p < endp
 619          && ! CHAR_BYTE8_HEAD_P (*p)
 620          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 621     p += n, chars++;
 622   if (nchars)
 623     *nchars = chars;
 624   if (p == endp)
 625     return nbytes;
 626
 627   to = p;
 628   nbytes = endp - p;
 629   endp = str + len;
 630   memmove (endp - nbytes, p, nbytes);
 631   p = endp - nbytes;
 632
 633   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 634     {
 635       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 636       while (p < adjusted_endp)
 637         {
 638           if (! CHAR_BYTE8_HEAD_P (*p)
 639               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 640             {
 641               while (n--)
 642                 *to++ = *p++;
 643             }
 644           else
 645             {
 646               int c = *p++;
 647               c = BYTE8_TO_CHAR (c);
 648               to += CHAR_STRING (c, to);
 649             }
 650         }
 651       chars++;
 652     }
 653   while (p < endp)
 654     {
 655       if (! CHAR_BYTE8_HEAD_P (*p)
 656           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 657         {
 658           while (n--)
 659             *to++ = *p++;
 660         }
 661       else
 662         {
 663           int c = *p++;
 664           c = BYTE8_TO_CHAR (c);
 665           to += CHAR_STRING (c, to);
 666         }
 667       chars++;
 668     }
 669   if (nchars)
 670     *nchars = chars;
 671   return (to - str);
 672 }
 673
 674 /* Parse unibyte string at STR of LEN bytes, and return the number of
 675    bytes it may occupy when converted to multibyte string by
 676    `str_to_multibyte'.  */
 677
 678 ptrdiff_t
 679 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 680 {
 681   const unsigned char *endp = str + len;
 682   ptrdiff_t bytes;
 683
 684   for (bytes = 0; str < endp; str++)
 685     {
 686       int n = *str < 0x80 ? 1 : 2;
 687       if (INT_ADD_OVERFLOW (bytes, n))
 688         string_overflow ();
 689       bytes += n;
 690     }
 691   return bytes;
 692 }
 693
 694
 695 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 696    that contains the same single-byte characters.  It actually
 697    converts all 8-bit characters to multibyte forms.  It is assured
 698    that we can use LEN bytes at STR as a work area and that is
 699    enough.  */
 700
 701 ptrdiff_t
 702 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 703 {
 704   unsigned char *p = str, *endp = str + bytes;
 705   unsigned char *to;
 706
 707   while (p < endp && *p < 0x80) p++;
 708   if (p == endp)
 709     return bytes;
 710   to = p;
 711   bytes = endp - p;
 712   endp = str + len;
 713   memmove (endp - bytes, p, bytes);
 714   p = endp - bytes;
 715   while (p < endp)
 716     {
 717       int c = *p++;
 718
 719       if (c >= 0x80)
 720         c = BYTE8_TO_CHAR (c);
 721       to += CHAR_STRING (c, to);
 722     }
 723   return (to - str);
 724 }
 725
 726 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 727    actually converts characters in the range 0x80..0xFF to
 728    unibyte.  */
 729
 730 ptrdiff_t
 731 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 732 {
 733   const unsigned char *p = str, *endp = str + bytes;
 734   unsigned char *to;
 735   int c, len;
 736
 737   while (p < endp)
 738     {
 739       c = *p;
 740       len = BYTES_BY_CHAR_HEAD (c);
 741       if (CHAR_BYTE8_HEAD_P (c))
 742         break;
 743       p += len;
 744     }
 745   to = str + (p - str);
 746   while (p < endp)
 747     {
 748       c = *p;
 749       len = BYTES_BY_CHAR_HEAD (c);
 750       if (CHAR_BYTE8_HEAD_P (c))
 751         {
 752           c = STRING_CHAR_ADVANCE (p);
 753           *to++ = CHAR_TO_BYTE8 (c);
 754         }
 755       else
 756         {
 757           while (len--) *to++ = *p++;
 758         }
 759     }
 760   return (to - str);
 761 }
 762
 763 /* Convert eight-bit chars in SRC (in multibyte form) to the
 764    corresponding byte and store in DST.  CHARS is the number of
 765    characters in SRC.  The value is the number of bytes stored in DST.
 766    Usually, the value is the same as CHARS, but is less than it if SRC
 767    contains a non-ASCII, non-eight-bit character.  */
 768
 769 ptrdiff_t
 770 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
 771 {
 772   ptrdiff_t i;
 773
 774   for (i = 0; i < chars; i++)
 775     {
 776       int c = STRING_CHAR_ADVANCE (src);
 777
 778       if (CHAR_BYTE8_P (c))
 779         c = CHAR_TO_BYTE8 (c);
 780       else if (! ASCII_CHAR_P (c))
 781         return i;
 782       *dst++ = c;
 783     }
 784   return i;
 785 }
 786
 787
 788 static ptrdiff_t
 789 string_count_byte8 (Lisp_Object string)
 790 {
 791   bool multibyte = STRING_MULTIBYTE (string);
 792   ptrdiff_t nbytes = SBYTES (string);
 793   unsigned char *p = SDATA (string);
 794   unsigned char *pend = p + nbytes;
 795   ptrdiff_t count = 0;
 796   int c, len;
 797
 798   if (multibyte)
 799     while (p < pend)
 800       {
 801         c = *p;
 802         len = BYTES_BY_CHAR_HEAD (c);
 803
 804         if (CHAR_BYTE8_HEAD_P (c))
 805           count++;
 806         p += len;
 807       }
 808   else
 809     while (p < pend)
 810       {
 811         if (*p++ >= 0x80)
 812           count++;
 813       }
 814   return count;
 815 }
 816
 817
 818 Lisp_Object
 819 string_escape_byte8 (Lisp_Object string)
 820 {
 821   ptrdiff_t nchars = SCHARS (string);
 822   ptrdiff_t nbytes = SBYTES (string);
 823   bool multibyte = STRING_MULTIBYTE (string);
 824   ptrdiff_t byte8_count;
 825   const unsigned char *src, *src_end;
 826   unsigned char *dst;
 827   Lisp_Object val;
 828   int c, len;
 829
 830   if (multibyte && nchars == nbytes)
 831     return string;
 832
 833   byte8_count = string_count_byte8 (string);
 834
 835   if (byte8_count == 0)
 836     return string;
 837
 838   if (multibyte)
 839     {
 840       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 841           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 842         string_overflow ();
 843
 844       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 845       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 846                                           nbytes + byte8_count * 2);
 847     }
 848   else
 849     {
 850       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 851         string_overflow ();
 852
 853       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 854       val = make_uninit_string (nbytes + byte8_count * 3);
 855     }
 856
 857   src = SDATA (string);
 858   src_end = src + nbytes;
 859   dst = SDATA (val);
 860   if (multibyte)
 861     while (src < src_end)
 862       {
 863         c = *src;
 864         len = BYTES_BY_CHAR_HEAD (c);
 865
 866         if (CHAR_BYTE8_HEAD_P (c))
 867           {
 868             c = STRING_CHAR_ADVANCE (src);
 869             c = CHAR_TO_BYTE8 (c);
 870             dst += sprintf ((char *) dst, "\\%03o", c);
 871           }
 872         else
 873           while (len--) *dst++ = *src++;
 874       }
 875   else
 876     while (src < src_end)
 877       {
 878         c = *src++;
 879         if (c >= 0x80)
 880           dst += sprintf ((char *) dst, "\\%03o", c);
 881         else
 882           *dst++ = c;
 883       }
 884   return val;
 885 }
 886
 887 \f
 888 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 889        doc: /*
 890 Concatenate all the argument characters and make the result a string.
 891 usage: (string &rest CHARACTERS)  */)
 892   (ptrdiff_t n, Lisp_Object *args)
 893 {
 894   ptrdiff_t i;
 895   int c;
 896   unsigned char *buf, *p;
 897   Lisp_Object str;
 898   USE_SAFE_ALLOCA;
 899
 900   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 901   p = buf;
 902
 903   for (i = 0; i < n; i++)
 904     {
 905       CHECK_CHARACTER (args[i]);
 906       c = XINT (args[i]);
 907       p += CHAR_STRING (c, p);
 908     }
 909
 910   str = make_string_from_bytes ((char *) buf, n, p - buf);
 911   SAFE_FREE ();
 912   return str;
 913 }
 914
 915 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 916        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 917 usage: (unibyte-string &rest BYTES)  */)
 918   (ptrdiff_t n, Lisp_Object *args)
 919 {
 920   ptrdiff_t i;
 921   Lisp_Object str;
 922   USE_SAFE_ALLOCA;
 923   unsigned char *buf = SAFE_ALLOCA (n);
 924   unsigned char *p = buf;
 925
 926   for (i = 0; i < n; i++)
 927     {
 928       CHECK_RANGED_INTEGER (args[i], 0, 255);
 929       *p++ = XINT (args[i]);
 930     }
 931
 932   str = make_string_from_bytes ((char *) buf, n, p - buf);
 933   SAFE_FREE ();
 934   return str;
 935 }
 936
 937 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 938        Schar_resolve_modifiers, 1, 1, 0,
 939        doc: /* Resolve modifiers in the character CHAR.
 940 The value is a character with modifiers resolved into the character
 941 code.  Unresolved modifiers are kept in the value.
 942 usage: (char-resolve-modifiers CHAR)  */)
 943   (Lisp_Object character)
 944 {
 945   EMACS_INT c;
 946
 947   CHECK_NUMBER (character);
 948   c = XINT (character);
 949   return make_number (char_resolve_modifier_mask (c));
 950 }
 951
 952 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 953        doc: /* Return a byte value of a character at point.
 954 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 955 a byte value.
 956 Optional 2nd arg STRING, if non-nil, is a string of which first
 957 character is a target to get a byte value.  In this case, POSITION, if
 958 non-nil, is an index of a target character in the string.
 959
 960 If the current buffer (or STRING) is multibyte, and the target
 961 character is not ASCII nor 8-bit character, an error is signaled.  */)
 962   (Lisp_Object position, Lisp_Object string)
 963 {
 964   int c;
 965   ptrdiff_t pos;
 966   unsigned char *p;
 967
 968   if (NILP (string))
 969     {
 970       if (NILP (position))
 971         {
 972           p = PT_ADDR;
 973         }
 974       else
 975         {
 976           CHECK_NUMBER_COERCE_MARKER (position);
 977           if (XINT (position) < BEGV || XINT (position) >= ZV)
 978             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 979           pos = XFASTINT (position);
 980           p = CHAR_POS_ADDR (pos);
 981         }
 982       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 983         return make_number (*p);
 984     }
 985   else
 986     {
 987       CHECK_STRING (string);
 988       if (NILP (position))
 989         {
 990           p = SDATA (string);
 991         }
 992       else
 993         {
 994           CHECK_NATNUM (position);
 995           if (XINT (position) >= SCHARS (string))
 996             args_out_of_range (string, position);
 997           pos = XFASTINT (position);
 998           p = SDATA (string) + string_char_to_byte (string, pos);
 999         }
1000       if (! STRING_MULTIBYTE (string))
1001         return make_number (*p);
1002     }
1003   c = STRING_CHAR (p);
1004   if (CHAR_BYTE8_P (c))
1005     c = CHAR_TO_BYTE8 (c);
1006   else if (! ASCII_CHAR_P (c))
1007     error ("Not an ASCII nor an 8-bit character: %d", c);
1008   return make_number (c);
1009 }
1010
1011 #ifdef emacs
1012
/* Set up the Lisp-visible state owned by this file: the symbols it
   uses, the primitives defined above, and the Lisp variables (mostly
   char-tables) together with their initial contents.  */

void
syms_of_character (void)
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is a plain C variable, not a DEFVAR_LISP, so it
     is registered with staticpro explicitly.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register the primitives defined in this file.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Starts out as 16 slots, all nil.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only space and newline are marked.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* Default width is 1; the ranges 0x80..0x9F and
     MAX_5_BYTE_CHAR + 1..MAX_CHAR are given width 4.  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("printable-chars", Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* Marked as printable: codes 32..126 and 160..MAX_5_BYTE_CHAR.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* The `char-table-extra-slots' property (value 1) is put on the
     symbol before the table is created with that symbol.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS).
SCRIPT is a symbol representing a script or a subgroup of a script.
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is setup in characters.el.  */
  Vunicode_category_table = Qnil;
}
1096
1097 #endif /* emacs */