src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2011  Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <setjmp.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
  51 Lisp_Object Qcharacterp;
  52
  53 static Lisp_Object Qauto_fill_chars;
  54
  55 /* Char-table of information about which character to unify to which
  56    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  57 Lisp_Object Vchar_unify_table;
  58
  59 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  60 unsigned char *_fetch_multibyte_char_p;
  61
  62 static Lisp_Object Qchar_script_table;
  63
  64 \f
  65
  66 /* If character code C has modifier masks, reflect them to the
  67    character code if possible.  Return the resulting code.  */
  68
  69 int
  70 char_resolve_modifier_mask (int c)
  71 {
  72   /* A non-ASCII character can't reflect modifier bits to the code.  */
  73   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  74     return c;
  75
  76   /* For Meta, Shift, and Control modifiers, we need special care.  */
  77   if (c & CHAR_SHIFT)
  78     {
  79       /* Shift modifier is valid only with [A-Za-z].  */
  80       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  81         c &= ~CHAR_SHIFT;
  82       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  83         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  84       /* Shift modifier for control characters and SPC is ignored.  */
  85       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  86         c &= ~CHAR_SHIFT;
  87     }
  88   if (c & CHAR_CTL)
  89     {
  90       /* Simulate the code in lread.c.  */
  91       /* Allow `\C- ' and `\C-?'.  */
  92       if ((c & 0377) == ' ')
  93         c &= ~0177 & ~ CHAR_CTL;
  94       else if ((c & 0377) == '?')
  95         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  96       /* ASCII control chars are made from letters (both cases),
  97          as well as the non-letters within 0100...0137.  */
  98       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  99         c &= (037 | (~0177 & ~CHAR_CTL));
 100       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 101         c &= (037 | (~0177 & ~CHAR_CTL));
 102     }
 103 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 104   if (c & CHAR_META)
 105     {
 106       /* Move the meta bit to the right place for a string.  */
 107       c = (c & ~CHAR_META) | 0x80;
 108     }
 109 #endif
 110
 111   return c;
 112 }
 113
 114
 115 /* Store multibyte form of character C at P.  If C has modifier bits,
 116    handle them appropriately.  */
 117
 118 int
 119 char_string (unsigned int c, unsigned char *p)
 120 {
 121   int bytes;
 122
 123   if (c & CHAR_MODIFIER_MASK)
 124     {
 125       c = (unsigned) char_resolve_modifier_mask ((int) c);
 126       /* If C still has any modifier bits, just ignore it.  */
 127       c &= ~CHAR_MODIFIER_MASK;
 128     }
 129
 130   MAYBE_UNIFY_CHAR (c);
 131
 132   if (c <= MAX_3_BYTE_CHAR)
 133     {
 134       bytes = CHAR_STRING (c, p);
 135     }
 136   else if (c <= MAX_4_BYTE_CHAR)
 137     {
 138       p[0] = (0xF0 | (c >> 18));
 139       p[1] = (0x80 | ((c >> 12) & 0x3F));
 140       p[2] = (0x80 | ((c >> 6) & 0x3F));
 141       p[3] = (0x80 | (c & 0x3F));
 142       bytes = 4;
 143     }
 144   else if (c <= MAX_5_BYTE_CHAR)
 145     {
 146       p[0] = 0xF8;
 147       p[1] = (0x80 | ((c >> 18) & 0x0F));
 148       p[2] = (0x80 | ((c >> 12) & 0x3F));
 149       p[3] = (0x80 | ((c >> 6) & 0x3F));
 150       p[4] = (0x80 | (c & 0x3F));
 151       bytes = 5;
 152     }
 153   else if (c <= MAX_CHAR)
 154     {
 155       c = CHAR_TO_BYTE8 (c);
 156       bytes = BYTE8_STRING (c, p);
 157     }
 158   else
 159     error ("Invalid character: %x", c);
 160
 161   return bytes;
 162 }
 163
 164
 165 /* Return a character whose multibyte form is at P.  If LEN is not
 166    NULL, it must be a pointer to integer.  In that case, set *LEN to
 167    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 168    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 169    the ending address (i.e., the starting address of the next
 170    character) of the multibyte form.  */
 171
 172 int
 173 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 174 {
 175   int c;
 176   const unsigned char *saved_p = p;
 177
 178   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 179     {
 180       c = STRING_CHAR_ADVANCE (p);
 181     }
 182   else if (! (*p & 0x08))
 183     {
 184       c = ((((p)[0] & 0xF) << 18)
 185            | (((p)[1] & 0x3F) << 12)
 186            | (((p)[2] & 0x3F) << 6)
 187            | ((p)[3] & 0x3F));
 188       p += 4;
 189     }
 190   else
 191     {
 192       c = ((((p)[1] & 0x3F) << 18)
 193            | (((p)[2] & 0x3F) << 12)
 194            | (((p)[3] & 0x3F) << 6)
 195            | ((p)[4] & 0x3F));
 196       p += 5;
 197     }
 198
 199   MAYBE_UNIFY_CHAR (c);
 200
 201   if (len)
 202     *len = p - saved_p;
 203   if (advanced)
 204     *advanced = p;
 205   return c;
 206 }
 207
 208
 209 /* Translate character C by translation table TABLE.  If no translation is
 210    found in TABLE, return the untranslated character.  If TABLE is a list,
 211    elements are char tables.  In that case, recursively translate C by all the
 212    tables in the list.  */
 213
 214 int
 215 translate_char (Lisp_Object table, int c)
 216 {
 217   if (CHAR_TABLE_P (table))
 218     {
 219       Lisp_Object ch;
 220
 221       ch = CHAR_TABLE_REF (table, c);
 222       if (CHARACTERP (ch))
 223         c = XINT (ch);
 224     }
 225   else
 226     {
 227       for (; CONSP (table); table = XCDR (table))
 228         c = translate_char (XCAR (table), c);
 229     }
 230   return c;
 231 }
 232
 233 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 234    them, return (C & 0xFF).  */
 235
 236 int
 237 multibyte_char_to_unibyte (int c)
 238 {
 239   if (c < 0x80)
 240     return c;
 241   if (CHAR_BYTE8_P (c))
 242     return CHAR_TO_BYTE8 (c);
 243   return (c & 0xFF);
 244 }
 245
 246 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 247    by charset_unibyte.  */
 248
 249 int
 250 multibyte_char_to_unibyte_safe (int c)
 251 {
 252   if (c < 0x80)
 253     return c;
 254   if (CHAR_BYTE8_P (c))
 255     return CHAR_TO_BYTE8 (c);
 256   return -1;
 257 }
 258
 259 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 260        doc: /* Return non-nil if OBJECT is a character.  */)
 261   (Lisp_Object object, Lisp_Object ignore)
 262 {
 263   return (CHARACTERP (object) ? Qt : Qnil);
 264 }
 265
 266 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 267        doc: /* Return the character of the maximum code.  */)
 268   (void)
 269 {
 270   return make_number (MAX_CHAR);
 271 }
 272
 273 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 274        Sunibyte_char_to_multibyte, 1, 1, 0,
 275        doc: /* Convert the byte CH to multibyte character.  */)
 276   (Lisp_Object ch)
 277 {
 278   int c;
 279
 280   CHECK_CHARACTER (ch);
 281   c = XFASTINT (ch);
 282   if (c >= 0x100)
 283     error ("Not a unibyte character: %d", c);
 284   MAKE_CHAR_MULTIBYTE (c);
 285   return make_number (c);
 286 }
 287
 288 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 289        Smultibyte_char_to_unibyte, 1, 1, 0,
 290        doc: /* Convert the multibyte character CH to a byte.
 291 If the multibyte character does not represent a byte, return -1.  */)
 292   (Lisp_Object ch)
 293 {
 294   int cm;
 295
 296   CHECK_CHARACTER (ch);
 297   cm = XFASTINT (ch);
 298   if (cm < 256)
 299     /* Can't distinguish a byte read from a unibyte buffer from
 300        a latin1 char, so let's let it slide.  */
 301     return ch;
 302   else
 303     {
 304       int cu = CHAR_TO_BYTE_SAFE (cm);
 305       return make_number (cu);
 306     }
 307 }
 308
 309 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 310        doc: /* Return width of CHAR when displayed in the current buffer.
 311 The width is measured by how many columns it occupies on the screen.
 312 Tab is taken to occupy `tab-width' columns.
 313 usage: (char-width CHAR)  */)
 314   (Lisp_Object ch)
 315 {
 316   Lisp_Object disp;
 317   int c, width;
 318   struct Lisp_Char_Table *dp = buffer_display_table ();
 319
 320   CHECK_CHARACTER (ch);
 321   c = XINT (ch);
 322
 323   /* Get the way the display table would display it.  */
 324   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 325
 326   if (VECTORP (disp))
 327     width = ASIZE (disp);
 328   else
 329     width = CHAR_WIDTH (c);
 330
 331   return make_number (width);
 332 }
 333
 334 /* Return width of string STR of length LEN when displayed in the
 335    current buffer.  The width is measured by how many columns it
 336    occupies on the screen.  If PRECISION > 0, return the width of
 337    longest substring that doesn't exceed PRECISION, and set number of
 338    characters and bytes of the substring in *NCHARS and *NBYTES
 339    respectively.  */
 340
 341 EMACS_INT
 342 c_string_width (const unsigned char *str, EMACS_INT len, int precision,
 343                 EMACS_INT *nchars, EMACS_INT *nbytes)
 344 {
 345   EMACS_INT i = 0, i_byte = 0;
 346   EMACS_INT width = 0;
 347   struct Lisp_Char_Table *dp = buffer_display_table ();
 348
 349   while (i_byte < len)
 350     {
 351       int bytes, thiswidth;
 352       Lisp_Object val;
 353       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 354
 355       if (dp)
 356         {
 357           val = DISP_CHAR_VECTOR (dp, c);
 358           if (VECTORP (val))
 359             thiswidth = XVECTOR (val)->size;
 360           else
 361             thiswidth = CHAR_WIDTH (c);
 362         }
 363       else
 364         {
 365           thiswidth = CHAR_WIDTH (c);
 366         }
 367
 368       if (precision > 0
 369           && (width + thiswidth > precision))
 370         {
 371           *nchars = i;
 372           *nbytes = i_byte;
 373           return width;
 374         }
 375       i++;
 376       i_byte += bytes;
 377       width += thiswidth;
 378   }
 379
 380   if (precision > 0)
 381     {
 382       *nchars = i;
 383       *nbytes = i_byte;
 384     }
 385
 386   return width;
 387 }
 388
 389 /* Return width of string STR of length LEN when displayed in the
 390    current buffer.  The width is measured by how many columns it
 391    occupies on the screen.  */
 392
 393 EMACS_INT
 394 strwidth (const char *str, EMACS_INT len)
 395 {
 396   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 397 }
 398
 399 /* Return width of Lisp string STRING when displayed in the current
 400    buffer.  The width is measured by how many columns it occupies on
 401    the screen while paying attention to compositions.  If PRECISION >
 402    0, return the width of longest substring that doesn't exceed
 403    PRECISION, and set number of characters and bytes of the substring
 404    in *NCHARS and *NBYTES respectively.  */
 405
 406 EMACS_INT
 407 lisp_string_width (Lisp_Object string, int precision,
 408                    EMACS_INT *nchars, EMACS_INT *nbytes)
 409 {
 410   EMACS_INT len = SCHARS (string);
 411   /* This set multibyte to 0 even if STRING is multibyte when it
 412      contains only ascii and eight-bit-graphic, but that's
 413      intentional.  */
 414   int multibyte = len < SBYTES (string);
 415   unsigned char *str = SDATA (string);
 416   EMACS_INT i = 0, i_byte = 0;
 417   EMACS_INT width = 0;
 418   struct Lisp_Char_Table *dp = buffer_display_table ();
 419
 420   while (i < len)
 421     {
 422       int chars, bytes, thiswidth;
 423       Lisp_Object val;
 424       int cmp_id;
 425       EMACS_INT ignore, end;
 426
 427       if (find_composition (i, -1, &ignore, &end, &val, string)
 428           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 429               >= 0))
 430         {
 431           thiswidth = composition_table[cmp_id]->width;
 432           chars = end - i;
 433           bytes = string_char_to_byte (string, end) - i_byte;
 434         }
 435       else
 436         {
 437           int c;
 438
 439           if (multibyte)
 440             c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 441           else
 442             c = str[i_byte], bytes = 1;
 443           chars = 1;
 444           if (dp)
 445             {
 446               val = DISP_CHAR_VECTOR (dp, c);
 447               if (VECTORP (val))
 448                 thiswidth = XVECTOR (val)->size;
 449               else
 450                 thiswidth = CHAR_WIDTH (c);
 451             }
 452           else
 453             {
 454               thiswidth = CHAR_WIDTH (c);
 455             }
 456         }
 457
 458       if (precision > 0
 459           && (width + thiswidth > precision))
 460         {
 461           *nchars = i;
 462           *nbytes = i_byte;
 463           return width;
 464         }
 465       i += chars;
 466       i_byte += bytes;
 467       width += thiswidth;
 468   }
 469
 470   if (precision > 0)
 471     {
 472       *nchars = i;
 473       *nbytes = i_byte;
 474     }
 475
 476   return width;
 477 }
 478
 479 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 480        doc: /* Return width of STRING when displayed in the current buffer.
 481 Width is measured by how many columns it occupies on the screen.
 482 When calculating width of a multibyte character in STRING,
 483 only the base leading-code is considered; the validity of
 484 the following bytes is not checked.  Tabs in STRING are always
 485 taken to occupy `tab-width' columns.
 486 usage: (string-width STRING)  */)
 487   (Lisp_Object str)
 488 {
 489   Lisp_Object val;
 490
 491   CHECK_STRING (str);
 492   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 493   return val;
 494 }
 495
 496 /* Return the number of characters in the NBYTES bytes at PTR.
 497    This works by looking at the contents and checking for multibyte
 498    sequences while assuming that there's no invalid sequence.
 499    However, if the current buffer has enable-multibyte-characters =
 500    nil, we treat each byte as a character.  */
 501
 502 EMACS_INT
 503 chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
 504 {
 505   /* current_buffer is null at early stages of Emacs initialization.  */
 506   if (current_buffer == 0
 507       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 508     return nbytes;
 509
 510   return multibyte_chars_in_text (ptr, nbytes);
 511 }
 512
 513 /* Return the number of characters in the NBYTES bytes at PTR.
 514    This works by looking at the contents and checking for multibyte
 515    sequences while assuming that there's no invalid sequence.  It
 516    ignores enable-multibyte-characters.  */
 517
 518 EMACS_INT
 519 multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
 520 {
 521   const unsigned char *endp = ptr + nbytes;
 522   EMACS_INT chars = 0;
 523
 524   while (ptr < endp)
 525     {
 526       EMACS_INT len = MULTIBYTE_LENGTH (ptr, endp);
 527
 528       if (len == 0)
 529         abort ();
 530       ptr += len;
 531       chars++;
 532     }
 533
 534   return chars;
 535 }
 536
 537 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 538    characters and bytes in it, and store them in *NCHARS and *NBYTES
 539    respectively.  On counting bytes, pay attention to that 8-bit
 540    characters not constructing a valid multibyte sequence are
 541    represented by 2-byte in a multibyte text.  */
 542
 543 void
 544 parse_str_as_multibyte (const unsigned char *str, EMACS_INT len,
 545                         EMACS_INT *nchars, EMACS_INT *nbytes)
 546 {
 547   const unsigned char *endp = str + len;
 548   EMACS_INT n, chars = 0, bytes = 0;
 549
 550   if (len >= MAX_MULTIBYTE_LENGTH)
 551     {
 552       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 553       while (str < adjusted_endp)
 554         {
 555           if (! CHAR_BYTE8_HEAD_P (*str)
 556               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 557             str += n, bytes += n;
 558           else
 559             str++, bytes += 2;
 560           chars++;
 561         }
 562     }
 563   while (str < endp)
 564     {
 565       if (! CHAR_BYTE8_HEAD_P (*str)
 566           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 567         str += n, bytes += n;
 568       else
 569         str++, bytes += 2;
 570       chars++;
 571     }
 572
 573   *nchars = chars;
 574   *nbytes = bytes;
 575   return;
 576 }
 577
 578 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 579    It actually converts only such 8-bit characters that don't contruct
 580    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 581    NCHARS is nonzero, set *NCHARS to the number of characters in the
 582    text.  It is assured that we can use LEN bytes at STR as a work
 583    area and that is enough.  Return the number of bytes of the
 584    resulting text.  */
 585
 586 EMACS_INT
 587 str_as_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT nbytes,
 588                   EMACS_INT *nchars)
 589 {
 590   unsigned char *p = str, *endp = str + nbytes;
 591   unsigned char *to;
 592   EMACS_INT chars = 0;
 593   int n;
 594
 595   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 596     {
 597       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 598       while (p < adjusted_endp
 599              && ! CHAR_BYTE8_HEAD_P (*p)
 600              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 601         p += n, chars++;
 602     }
 603   while (p < endp
 604          && ! CHAR_BYTE8_HEAD_P (*p)
 605          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 606     p += n, chars++;
 607   if (nchars)
 608     *nchars = chars;
 609   if (p == endp)
 610     return nbytes;
 611
 612   to = p;
 613   nbytes = endp - p;
 614   endp = str + len;
 615   memmove (endp - nbytes, p, nbytes);
 616   p = endp - nbytes;
 617
 618   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 619     {
 620       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 621       while (p < adjusted_endp)
 622         {
 623           if (! CHAR_BYTE8_HEAD_P (*p)
 624               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 625             {
 626               while (n--)
 627                 *to++ = *p++;
 628             }
 629           else
 630             {
 631               int c = *p++;
 632               c = BYTE8_TO_CHAR (c);
 633               to += CHAR_STRING (c, to);
 634             }
 635         }
 636       chars++;
 637     }
 638   while (p < endp)
 639     {
 640       if (! CHAR_BYTE8_HEAD_P (*p)
 641           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 642         {
 643           while (n--)
 644             *to++ = *p++;
 645         }
 646       else
 647         {
 648           int c = *p++;
 649           c = BYTE8_TO_CHAR (c);
 650           to += CHAR_STRING (c, to);
 651         }
 652       chars++;
 653     }
 654   if (nchars)
 655     *nchars = chars;
 656   return (to - str);
 657 }
 658
 659 /* Parse unibyte string at STR of LEN bytes, and return the number of
 660    bytes it may ocupy when converted to multibyte string by
 661    `str_to_multibyte'.  */
 662
 663 EMACS_INT
 664 parse_str_to_multibyte (const unsigned char *str, EMACS_INT len)
 665 {
 666   const unsigned char *endp = str + len;
 667   EMACS_INT bytes;
 668
 669   for (bytes = 0; str < endp; str++)
 670     bytes += (*str < 0x80) ? 1 : 2;
 671   return bytes;
 672 }
 673
 674
 675 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 676    that contains the same single-byte characters.  It actually
 677    converts all 8-bit characters to multibyte forms.  It is assured
 678    that we can use LEN bytes at STR as a work area and that is
 679    enough.  */
 680
 681 EMACS_INT
 682 str_to_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT bytes)
 683 {
 684   unsigned char *p = str, *endp = str + bytes;
 685   unsigned char *to;
 686
 687   while (p < endp && *p < 0x80) p++;
 688   if (p == endp)
 689     return bytes;
 690   to = p;
 691   bytes = endp - p;
 692   endp = str + len;
 693   memmove (endp - bytes, p, bytes);
 694   p = endp - bytes;
 695   while (p < endp)
 696     {
 697       int c = *p++;
 698
 699       if (c >= 0x80)
 700         c = BYTE8_TO_CHAR (c);
 701       to += CHAR_STRING (c, to);
 702     }
 703   return (to - str);
 704 }
 705
 706 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 707    actually converts characters in the range 0x80..0xFF to
 708    unibyte.  */
 709
 710 EMACS_INT
 711 str_as_unibyte (unsigned char *str, EMACS_INT bytes)
 712 {
 713   const unsigned char *p = str, *endp = str + bytes;
 714   unsigned char *to;
 715   int c, len;
 716
 717   while (p < endp)
 718     {
 719       c = *p;
 720       len = BYTES_BY_CHAR_HEAD (c);
 721       if (CHAR_BYTE8_HEAD_P (c))
 722         break;
 723       p += len;
 724     }
 725   to = str + (p - str);
 726   while (p < endp)
 727     {
 728       c = *p;
 729       len = BYTES_BY_CHAR_HEAD (c);
 730       if (CHAR_BYTE8_HEAD_P (c))
 731         {
 732           c = STRING_CHAR_ADVANCE (p);
 733           *to++ = CHAR_TO_BYTE8 (c);
 734         }
 735       else
 736         {
 737           while (len--) *to++ = *p++;
 738         }
 739     }
 740   return (to - str);
 741 }
 742
 743 /* Convert eight-bit chars in SRC (in multibyte form) to the
 744    corresponding byte and store in DST.  CHARS is the number of
 745    characters in SRC.  The value is the number of bytes stored in DST.
 746    Usually, the value is the same as CHARS, but is less than it if SRC
 747    contains a non-ASCII, non-eight-bit character.  If ACCEPT_LATIN_1
 748    is nonzero, a Latin-1 character is accepted and converted to a byte
 749    of that character code.
 750    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 751
 752 EMACS_INT
 753 str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
 754 {
 755   EMACS_INT i;
 756
 757   for (i = 0; i < chars; i++)
 758     {
 759       int c = STRING_CHAR_ADVANCE (src);
 760
 761       if (CHAR_BYTE8_P (c))
 762         c = CHAR_TO_BYTE8 (c);
 763       else if (! ASCII_CHAR_P (c)
 764                && (! accept_latin_1 || c >= 0x100))
 765         return i;
 766       *dst++ = c;
 767     }
 768   return i;
 769 }
 770
 771
 772 static EMACS_INT
 773 string_count_byte8 (Lisp_Object string)
 774 {
 775   int multibyte = STRING_MULTIBYTE (string);
 776   EMACS_INT nbytes = SBYTES (string);
 777   unsigned char *p = SDATA (string);
 778   unsigned char *pend = p + nbytes;
 779   EMACS_INT count = 0;
 780   int c, len;
 781
 782   if (multibyte)
 783     while (p < pend)
 784       {
 785         c = *p;
 786         len = BYTES_BY_CHAR_HEAD (c);
 787
 788         if (CHAR_BYTE8_HEAD_P (c))
 789           count++;
 790         p += len;
 791       }
 792   else
 793     while (p < pend)
 794       {
 795         if (*p++ >= 0x80)
 796           count++;
 797       }
 798   return count;
 799 }
 800
 801
 802 Lisp_Object
 803 string_escape_byte8 (Lisp_Object string)
 804 {
 805   EMACS_INT nchars = SCHARS (string);
 806   EMACS_INT nbytes = SBYTES (string);
 807   int multibyte = STRING_MULTIBYTE (string);
 808   EMACS_INT byte8_count;
 809   const unsigned char *src, *src_end;
 810   unsigned char *dst;
 811   Lisp_Object val;
 812   int c, len;
 813
 814   if (multibyte && nchars == nbytes)
 815     return string;
 816
 817   byte8_count = string_count_byte8 (string);
 818
 819   if (byte8_count == 0)
 820     return string;
 821
 822   if (multibyte)
 823     {
 824       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 825           || (MOST_POSITIVE_FIXNUM - nbytes) / 2 < byte8_count)
 826         error ("Maximum string size exceeded");
 827
 828       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 829       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 830                                           nbytes + byte8_count * 2);
 831     }
 832   else
 833     {
 834       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count)
 835         error ("Maximum string size exceeded");
 836       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 837       val = make_uninit_string (nbytes + byte8_count * 3);
 838     }
 839
 840   src = SDATA (string);
 841   src_end = src + nbytes;
 842   dst = SDATA (val);
 843   if (multibyte)
 844     while (src < src_end)
 845       {
 846         c = *src;
 847         len = BYTES_BY_CHAR_HEAD (c);
 848
 849         if (CHAR_BYTE8_HEAD_P (c))
 850           {
 851             c = STRING_CHAR_ADVANCE (src);
 852             c = CHAR_TO_BYTE8 (c);
 853             sprintf ((char *) dst, "\\%03o", c);
 854             dst += 4;
 855           }
 856         else
 857           while (len--) *dst++ = *src++;
 858       }
 859   else
 860     while (src < src_end)
 861       {
 862         c = *src++;
 863         if (c >= 0x80)
 864           {
 865             sprintf ((char *) dst, "\\%03o", c);
 866             dst += 4;
 867           }
 868         else
 869           *dst++ = c;
 870       }
 871   return val;
 872 }
 873
 874 \f
 875 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 876        doc: /*
 877 Concatenate all the argument characters and make the result a string.
 878 usage: (string &rest CHARACTERS)  */)
 879   (size_t n, Lisp_Object *args)
 880 {
 881   size_t i;
 882   int c;
 883   unsigned char *buf, *p;
 884   Lisp_Object str;
 885   USE_SAFE_ALLOCA;
 886
 887   SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
 888   p = buf;
 889
 890   for (i = 0; i < n; i++)
 891     {
 892       CHECK_CHARACTER (args[i]);
 893       c = XINT (args[i]);
 894       p += CHAR_STRING (c, p);
 895     }
 896
 897   str = make_string_from_bytes ((char *) buf, n, p - buf);
 898   SAFE_FREE ();
 899   return str;
 900 }
 901
 902 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 903        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 904 usage: (unibyte-string &rest BYTES)  */)
 905   (size_t n, Lisp_Object *args)
 906 {
 907   size_t i;
 908   int c;
 909   unsigned char *buf, *p;
 910   Lisp_Object str;
 911   USE_SAFE_ALLOCA;
 912
 913   SAFE_ALLOCA (buf, unsigned char *, n);
 914   p = buf;
 915
 916   for (i = 0; i < n; i++)
 917     {
 918       CHECK_NATNUM (args[i]);
 919       c = XFASTINT (args[i]);
 920       if (c >= 256)
 921         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 922       *p++ = c;
 923     }
 924
 925   str = make_string_from_bytes ((char *) buf, n, p - buf);
 926   SAFE_FREE ();
 927   return str;
 928 }
 929
 930 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 931        Schar_resolve_modifiers, 1, 1, 0,
 932        doc: /* Resolve modifiers in the character CHAR.
 933 The value is a character with modifiers resolved into the character
 934 code.  Unresolved modifiers are kept in the value.
 935 usage: (char-resolve-modifiers CHAR)  */)
 936   (Lisp_Object character)
 937 {
 938   int c;
 939
 940   CHECK_NUMBER (character);
 941   c = XINT (character);
 942   return make_number (char_resolve_modifier_mask (c));
 943 }
 944
 945 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 946        doc: /* Return a byte value of a character at point.
 947 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 948 a byte value.
 949 Optional 2nd arg STRING, if non-nil, is a string of which first
 950 character is a target to get a byte value.  In this case, POSITION, if
 951 non-nil, is an index of a target character in the string.
 952
 953 If the current buffer (or STRING) is multibyte, and the target
 954 character is not ASCII nor 8-bit character, an error is signalled.  */)
 955   (Lisp_Object position, Lisp_Object string)
 956 {
 957   int c;
 958   EMACS_INT pos;
 959   unsigned char *p;
 960
 961   if (NILP (string))
 962     {
 963       if (NILP (position))
 964         {
 965           p = PT_ADDR;
 966         }
 967       else
 968         {
 969           CHECK_NUMBER_COERCE_MARKER (position);
 970           if (XINT (position) < BEGV || XINT (position) >= ZV)
 971             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 972           pos = XFASTINT (position);
 973           p = CHAR_POS_ADDR (pos);
 974         }
 975       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 976         return make_number (*p);
 977     }
 978   else
 979     {
 980       CHECK_STRING (string);
 981       if (NILP (position))
 982         {
 983           p = SDATA (string);
 984         }
 985       else
 986         {
 987           CHECK_NATNUM (position);
 988           if (XINT (position) >= SCHARS (string))
 989             args_out_of_range (string, position);
 990           pos = XFASTINT (position);
 991           p = SDATA (string) + string_char_to_byte (string, pos);
 992         }
 993       if (! STRING_MULTIBYTE (string))
 994         return make_number (*p);
 995     }
 996   c = STRING_CHAR (p);
 997   if (CHAR_BYTE8_P (c))
 998     c = CHAR_TO_BYTE8 (c);
 999   else if (! ASCII_CHAR_P (c))
1000     error ("Not an ASCII nor an 8-bit character: %d", c);
1001   return make_number (c);
1002 }
1003
1004
1005 void
1006 init_character_once (void)
1007 {
1008 }
1009
1010 #ifdef emacs
1011
1012 void
1013 syms_of_character (void)
1014 {
1015   DEFSYM (Qcharacterp, "characterp");
1016   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1017
1018   staticpro (&Vchar_unify_table);
1019   Vchar_unify_table = Qnil;
1020
1021   defsubr (&Smax_char);
1022   defsubr (&Scharacterp);
1023   defsubr (&Sunibyte_char_to_multibyte);
1024   defsubr (&Smultibyte_char_to_unibyte);
1025   defsubr (&Schar_width);
1026   defsubr (&Sstring_width);
1027   defsubr (&Sstring);
1028   defsubr (&Sunibyte_string);
1029   defsubr (&Schar_resolve_modifiers);
1030   defsubr (&Sget_byte);
1031
1032   DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
1033                doc: /*
1034 Vector recording all translation tables ever defined.
1035 Each element is a pair (SYMBOL . TABLE) relating the table to the
1036 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1037   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1038
1039   DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
1040                doc: /*
1041 A char-table for characters which invoke auto-filling.
1042 Such characters have value t in this table.  */);
1043   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1044   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1045   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1046
1047   DEFVAR_LISP ("char-width-table", Vchar_width_table,
1048                doc: /*
1049 A char-table for width (columns) of each character.  */);
1050   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1051   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1052   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1053                         make_number (4));
1054
1055   DEFVAR_LISP ("printable-chars", Vprintable_chars,
1056                doc: /* A char-table for each printable character.  */);
1057   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1058   Fset_char_table_range (Vprintable_chars,
1059                          Fcons (make_number (32), make_number (126)), Qt);
1060   Fset_char_table_range (Vprintable_chars,
1061                          Fcons (make_number (160),
1062                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1063
1064   DEFVAR_LISP ("char-script-table", Vchar_script_table,
1065                doc: /* Char table of script symbols.
1066 It has one extra slot whose value is a list of script symbols.  */);
1067
1068   /* Intern this now in case it isn't already done.
1069      Setting this variable twice is harmless.
1070      But don't staticpro it here--that is done in alloc.c.  */
1071   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
1072   DEFSYM (Qchar_script_table, "char-script-table");
1073   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1074   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1075
1076   DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
1077                doc: /* Alist of scripts vs the representative characters.
1078 Each element is a cons (SCRIPT . CHARS).
1079 SCRIPT is a symbol representing a script or a subgroup of a script.
1080 CHARS is a list or a vector of characters.
1081 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1082 If it is a vector, one of the characters in the vector is necessary.
1083 This variable is used to find a font for a specific script.  */);
1084   Vscript_representative_chars = Qnil;
1085
1086   DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
1087                doc: /* Char table of Unicode's "General Category".
1088 All Unicode characters have one of the following values (symbol):
1089   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1090   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1091 See The Unicode Standard for the meaning of those values.  */);
1092   /* The correct char-table is setup in characters.el.  */
1093   Vunicode_category_table = Qnil;
1094 }
1095
1096 #endif /* emacs */