src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   5      Free Software Foundation, Inc.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <setjmp.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
/* Symbol object for `characterp'; set up by DEFSYM in
   syms_of_character.  */
Lisp_Object Qcharacterp;

/* Vector of all translation tables ever defined.
   The ID of a translation table is used to index this vector.  */
Lisp_Object impl_Vtranslation_table_vector;

/* A char-table for characters which may invoke auto-filling.  */
Lisp_Object impl_Vauto_fill_chars;

/* Symbol object for `auto-fill-chars'; set up by DEFSYM in
   syms_of_character.  */
Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
Lisp_Object Vchar_unify_table;

/* A char-table.  An element is non-nil iff the corresponding
   character has a printable glyph.  */
Lisp_Object impl_Vprintable_chars;

/* A char-table.  An element is the column width of the corresponding
   character.  */
Lisp_Object impl_Vchar_width_table;

/* A char-table.  An element is a symbol indicating the direction
   property of the corresponding character.  */
Lisp_Object impl_Vchar_direction_table;

/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
unsigned char *_fetch_multibyte_char_p;

/* Char-table of scripts.  */
Lisp_Object impl_Vchar_script_table;

/* Alist of scripts vs representative characters.  */
Lisp_Object impl_Vscript_representative_chars;

/* NOTE(review): presumably the symbol `char-script-table'; its
   initialization is not in the part of the file reviewed -- confirm.  */
static Lisp_Object Qchar_script_table;

/* NOTE(review): presumably a char-table of Unicode category data;
   no use is visible in the part of the file reviewed -- confirm.  */
Lisp_Object impl_Vunicode_category_table;
  90
  91 \f
  92
  93 /* If character code C has modifier masks, reflect them to the
  94    character code if possible.  Return the resulting code.  */
  95
  96 int
  97 char_resolve_modifier_mask (c)
  98      int c;
  99 {
 100   /* A non-ASCII character can't reflect modifier bits to the code.  */
 101   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 102     return c;
 103
 104   /* For Meta, Shift, and Control modifiers, we need special care.  */
 105   if (c & CHAR_SHIFT)
 106     {
 107       /* Shift modifier is valid only with [A-Za-z].  */
 108       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 109         c &= ~CHAR_SHIFT;
 110       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 111         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 112       /* Shift modifier for control characters and SPC is ignored.  */
 113       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
 114         c &= ~CHAR_SHIFT;
 115     }
 116   if (c & CHAR_CTL)
 117     {
 118       /* Simulate the code in lread.c.  */
 119       /* Allow `\C- ' and `\C-?'.  */
 120       if ((c & 0377) == ' ')
 121         c &= ~0177 & ~ CHAR_CTL;
 122       else if ((c & 0377) == '?')
 123         c = 0177 | (c & ~0177 & ~CHAR_CTL);
 124       /* ASCII control chars are made from letters (both cases),
 125          as well as the non-letters within 0100...0137.  */
 126       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 127         c &= (037 | (~0177 & ~CHAR_CTL));
 128       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 129         c &= (037 | (~0177 & ~CHAR_CTL));
 130     }
 131 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 132   if (c & CHAR_META)
 133     {
 134       /* Move the meta bit to the right place for a string.  */
 135       c = (c & ~CHAR_META) | 0x80;
 136     }
 137 #endif
 138
 139   return c;
 140 }
 141
 142
 143 /* Store multibyte form of character C at P.  If C has modifier bits,
 144    handle them appropriately.  */
 145
 146 int
 147 char_string (c, p)
 148      unsigned c;
 149      unsigned char *p;
 150 {
 151   int bytes;
 152
 153   if (c & CHAR_MODIFIER_MASK)
 154     {
 155       c = (unsigned) char_resolve_modifier_mask ((int) c);
 156       /* If C still has any modifier bits, just ignore it.  */
 157       c &= ~CHAR_MODIFIER_MASK;
 158     }
 159
 160   MAYBE_UNIFY_CHAR (c);
 161
 162   if (c <= MAX_3_BYTE_CHAR)
 163     {
 164       bytes = CHAR_STRING (c, p);
 165     }
 166   else if (c <= MAX_4_BYTE_CHAR)
 167     {
 168       p[0] = (0xF0 | (c >> 18));
 169       p[1] = (0x80 | ((c >> 12) & 0x3F));
 170       p[2] = (0x80 | ((c >> 6) & 0x3F));
 171       p[3] = (0x80 | (c & 0x3F));
 172       bytes = 4;
 173     }
 174   else if (c <= MAX_5_BYTE_CHAR)
 175     {
 176       p[0] = 0xF8;
 177       p[1] = (0x80 | ((c >> 18) & 0x0F));
 178       p[2] = (0x80 | ((c >> 12) & 0x3F));
 179       p[3] = (0x80 | ((c >> 6) & 0x3F));
 180       p[4] = (0x80 | (c & 0x3F));
 181       bytes = 5;
 182     }
 183   else if (c <= MAX_CHAR)
 184     {
 185       c = CHAR_TO_BYTE8 (c);
 186       bytes = BYTE8_STRING (c, p);
 187     }
 188   else
 189     error ("Invalid character: %d", c);
 190
 191   return bytes;
 192 }
 193
 194
 195 /* Return a character whose multibyte form is at P.  Set LEN is not
 196    NULL, it must be a pointer to integer.  In that case, set *LEN to
 197    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 198    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 199    the ending address (i.e. the starting address of the next
 200    character) of the multibyte form.  */
 201
 202 int
 203 string_char (p, advanced, len)
 204      const unsigned char *p;
 205      const unsigned char **advanced;
 206      int *len;
 207 {
 208   int c;
 209   const unsigned char *saved_p = p;
 210
 211   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 212     {
 213       c = STRING_CHAR_ADVANCE (p);
 214     }
 215   else if (! (*p & 0x08))
 216     {
 217       c = ((((p)[0] & 0xF) << 18)
 218            | (((p)[1] & 0x3F) << 12)
 219            | (((p)[2] & 0x3F) << 6)
 220            | ((p)[3] & 0x3F));
 221       p += 4;
 222     }
 223   else
 224     {
 225       c = ((((p)[1] & 0x3F) << 18)
 226            | (((p)[2] & 0x3F) << 12)
 227            | (((p)[3] & 0x3F) << 6)
 228            | ((p)[4] & 0x3F));
 229       p += 5;
 230     }
 231
 232   MAYBE_UNIFY_CHAR (c);
 233
 234   if (len)
 235     *len = p - saved_p;
 236   if (advanced)
 237     *advanced = p;
 238   return c;
 239 }
 240
 241
 242 /* Translate character C by translation table TABLE.  If C is
 243    negative, translate a character specified by CHARSET and CODE.  If
 244    no translation is found in TABLE, return the untranslated
 245    character.  If TABLE is a list, elements are char tables.  In this
 246    case, translace C by all tables.  */
 247
 248 int
 249 translate_char (table, c)
 250      Lisp_Object table;
 251      int c;
 252 {
 253   if (CHAR_TABLE_P (table))
 254     {
 255       Lisp_Object ch;
 256
 257       ch = CHAR_TABLE_REF (table, c);
 258       if (CHARACTERP (ch))
 259         c = XINT (ch);
 260     }
 261   else
 262     {
 263       for (; CONSP (table); table = XCDR (table))
 264         c = translate_char (XCAR (table), c);
 265     }
 266   return c;
 267 }
 268
 269 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 270    them, return (C & 0xFF).
 271
 272    The argument REV_TBL is now ignored.  It will be removed in the
 273    future.  */
 274
 275 int
 276 multibyte_char_to_unibyte (c, rev_tbl)
 277      int c;
 278      Lisp_Object rev_tbl;
 279 {
 280   if (c < 0x80)
 281     return c;
 282   if (CHAR_BYTE8_P (c))
 283     return CHAR_TO_BYTE8 (c);
 284   return (c & 0xFF);
 285 }
 286
 287 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 288    by charset_unibyte.  */
 289
 290 int
 291 multibyte_char_to_unibyte_safe (c)
 292      int c;
 293 {
 294   if (c < 0x80)
 295     return c;
 296   if (CHAR_BYTE8_P (c))
 297     return CHAR_TO_BYTE8 (c);
 298   return -1;
 299 }
 300
 301 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 302        doc: /* Return non-nil if OBJECT is a character.  */)
 303      (object, ignore)
 304      Lisp_Object object, ignore;
 305 {
 306   return (CHARACTERP (object) ? Qt : Qnil);
 307 }
 308
 309 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 310        doc: /* Return the character of the maximum code.  */)
 311      ()
 312 {
 313   return make_number (MAX_CHAR);
 314 }
 315
 316 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 317        Sunibyte_char_to_multibyte, 1, 1, 0,
 318        doc: /* Convert the byte CH to multibyte character.  */)
 319      (ch)
 320      Lisp_Object ch;
 321 {
 322   int c;
 323
 324   CHECK_CHARACTER (ch);
 325   c = XFASTINT (ch);
 326   if (c >= 0x100)
 327     error ("Not a unibyte character: %d", c);
 328   MAKE_CHAR_MULTIBYTE (c);
 329   return make_number (c);
 330 }
 331
 332 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 333        Smultibyte_char_to_unibyte, 1, 1, 0,
 334        doc: /* Convert the multibyte character CH to a byte.
 335 If the multibyte character does not represent a byte, return -1.  */)
 336      (ch)
 337      Lisp_Object ch;
 338 {
 339   int cm;
 340
 341   CHECK_CHARACTER (ch);
 342   cm = XFASTINT (ch);
 343   if (cm < 256)
 344     /* Can't distinguish a byte read from a unibyte buffer from
 345        a latin1 char, so let's let it slide.  */
 346     return ch;
 347   else
 348     {
 349       int cu = CHAR_TO_BYTE_SAFE (cm);
 350       return make_number (cu);
 351     }
 352 }
 353
 354 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 355        doc: /* Return 1 regardless of the argument CHAR.
 356 This is now an obsolete function.  We keep it just for backward compatibility.
 357 usage: (char-bytes CHAR)  */)
 358      (ch)
 359      Lisp_Object ch;
 360 {
 361   CHECK_CHARACTER (ch);
 362   return make_number (1);
 363 }
 364
 365 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 366        doc: /* Return width of CHAR when displayed in the current buffer.
 367 The width is measured by how many columns it occupies on the screen.
 368 Tab is taken to occupy `tab-width' columns.
 369 usage: (char-width CHAR)  */)
 370      (ch)
 371        Lisp_Object ch;
 372 {
 373   Lisp_Object disp;
 374   int c, width;
 375   struct Lisp_Char_Table *dp = buffer_display_table ();
 376
 377   CHECK_CHARACTER (ch);
 378   c = XINT (ch);
 379
 380   /* Get the way the display table would display it.  */
 381   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 382
 383   if (VECTORP (disp))
 384     width = ASIZE (disp);
 385   else
 386     width = CHAR_WIDTH (c);
 387
 388   return make_number (width);
 389 }
 390
 391 /* Return width of string STR of length LEN when displayed in the
 392    current buffer.  The width is measured by how many columns it
 393    occupies on the screen.  If PRECISION > 0, return the width of
 394    longest substring that doesn't exceed PRECISION, and set number of
 395    characters and bytes of the substring in *NCHARS and *NBYTES
 396    respectively.  */
 397
 398 int
 399 c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
 400 {
 401   int i = 0, i_byte = 0;
 402   int width = 0;
 403   struct Lisp_Char_Table *dp = buffer_display_table ();
 404
 405   while (i_byte < len)
 406     {
 407       int bytes, thiswidth;
 408       Lisp_Object val;
 409       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 410
 411       if (dp)
 412         {
 413           val = DISP_CHAR_VECTOR (dp, c);
 414           if (VECTORP (val))
 415             thiswidth = XVECTOR (val)->size;
 416           else
 417             thiswidth = CHAR_WIDTH (c);
 418         }
 419       else
 420         {
 421           thiswidth = CHAR_WIDTH (c);
 422         }
 423
 424       if (precision > 0
 425           && (width + thiswidth > precision))
 426         {
 427           *nchars = i;
 428           *nbytes = i_byte;
 429           return width;
 430         }
 431       i++;
 432       i_byte += bytes;
 433       width += thiswidth;
 434   }
 435
 436   if (precision > 0)
 437     {
 438       *nchars = i;
 439       *nbytes = i_byte;
 440     }
 441
 442   return width;
 443 }
 444
 445 /* Return width of string STR of length LEN when displayed in the
 446    current buffer.  The width is measured by how many columns it
 447    occupies on the screen.  */
 448
 449 int
 450 strwidth (str, len)
 451      unsigned char *str;
 452      int len;
 453 {
 454   return c_string_width (str, len, -1, NULL, NULL);
 455 }
 456
 457 /* Return width of Lisp string STRING when displayed in the current
 458    buffer.  The width is measured by how many columns it occupies on
 459    the screen while paying attention to compositions.  If PRECISION >
 460    0, return the width of longest substring that doesn't exceed
 461    PRECISION, and set number of characters and bytes of the substring
 462    in *NCHARS and *NBYTES respectively.  */
 463
 464 int
 465 lisp_string_width (string, precision, nchars, nbytes)
 466      Lisp_Object string;
 467      int precision, *nchars, *nbytes;
 468 {
 469   int len = SCHARS (string);
 470   /* This set multibyte to 0 even if STRING is multibyte when it
 471      contains only ascii and eight-bit-graphic, but that's
 472      intentional.  */
 473   int multibyte = len < SBYTES (string);
 474   unsigned char *str = SDATA (string);
 475   int i = 0, i_byte = 0;
 476   int width = 0;
 477   struct Lisp_Char_Table *dp = buffer_display_table ();
 478
 479   while (i < len)
 480     {
 481       int chars, bytes, thiswidth;
 482       Lisp_Object val;
 483       int cmp_id;
 484       EMACS_INT ignore, end;
 485
 486       if (find_composition (i, -1, &ignore, &end, &val, string)
 487           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 488               >= 0))
 489         {
 490           thiswidth = composition_table[cmp_id]->width;
 491           chars = end - i;
 492           bytes = string_char_to_byte (string, end) - i_byte;
 493         }
 494       else
 495         {
 496           int c;
 497
 498           if (multibyte)
 499             c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 500           else
 501             c = str[i_byte], bytes = 1;
 502           chars = 1;
 503           if (dp)
 504             {
 505               val = DISP_CHAR_VECTOR (dp, c);
 506               if (VECTORP (val))
 507                 thiswidth = XVECTOR (val)->size;
 508               else
 509                 thiswidth = CHAR_WIDTH (c);
 510             }
 511           else
 512             {
 513               thiswidth = CHAR_WIDTH (c);
 514             }
 515         }
 516
 517       if (precision > 0
 518           && (width + thiswidth > precision))
 519         {
 520           *nchars = i;
 521           *nbytes = i_byte;
 522           return width;
 523         }
 524       i += chars;
 525       i_byte += bytes;
 526       width += thiswidth;
 527   }
 528
 529   if (precision > 0)
 530     {
 531       *nchars = i;
 532       *nbytes = i_byte;
 533     }
 534
 535   return width;
 536 }
 537
 538 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 539        doc: /* Return width of STRING when displayed in the current buffer.
 540 Width is measured by how many columns it occupies on the screen.
 541 When calculating width of a multibyte character in STRING,
 542 only the base leading-code is considered; the validity of
 543 the following bytes is not checked.  Tabs in STRING are always
 544 taken to occupy `tab-width' columns.
 545 usage: (string-width STRING)  */)
 546      (str)
 547      Lisp_Object str;
 548 {
 549   Lisp_Object val;
 550
 551   CHECK_STRING (str);
 552   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 553   return val;
 554 }
 555
 556 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 557        doc: /* Return the direction of CHAR.
 558 The returned value is 0 for left-to-right and 1 for right-to-left.
 559 usage: (char-direction CHAR)  */)
 560      (ch)
 561      Lisp_Object ch;
 562 {
 563   int c;
 564
 565   CHECK_CHARACTER (ch);
 566   c = XINT (ch);
 567   return CHAR_TABLE_REF (Vchar_direction_table, c);
 568 }
 569
 570 /* Return the number of characters in the NBYTES bytes at PTR.
 571    This works by looking at the contents and checking for multibyte
 572    sequences while assuming that there's no invalid sequence.
 573    However, if the current buffer has enable-multibyte-characters =
 574    nil, we treat each byte as a character.  */
 575
 576 EMACS_INT
 577 chars_in_text (ptr, nbytes)
 578      const unsigned char *ptr;
 579      EMACS_INT nbytes;
 580 {
 581   /* current_buffer is null at early stages of Emacs initialization.  */
 582   if (current_buffer == 0
 583       || NILP (BUF_ENABLE_MULTIBYTE_CHARACTERS (current_buffer)))
 584     return nbytes;
 585
 586   return multibyte_chars_in_text (ptr, nbytes);
 587 }
 588
 589 /* Return the number of characters in the NBYTES bytes at PTR.
 590    This works by looking at the contents and checking for multibyte
 591    sequences while assuming that there's no invalid sequence.  It
 592    ignores enable-multibyte-characters.  */
 593
 594 EMACS_INT
 595 multibyte_chars_in_text (ptr, nbytes)
 596      const unsigned char *ptr;
 597      EMACS_INT nbytes;
 598 {
 599   const unsigned char *endp = ptr + nbytes;
 600   int chars = 0;
 601
 602   while (ptr < endp)
 603     {
 604       int len = MULTIBYTE_LENGTH (ptr, endp);
 605
 606       if (len == 0)
 607         abort ();
 608       ptr += len;
 609       chars++;
 610     }
 611
 612   return chars;
 613 }
 614
 615 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 616    characters and bytes in it, and store them in *NCHARS and *NBYTES
 617    respectively.  On counting bytes, pay attention to that 8-bit
 618    characters not constructing a valid multibyte sequence are
 619    represented by 2-byte in a multibyte text.  */
 620
 621 void
 622 parse_str_as_multibyte (str, len, nchars, nbytes)
 623      const unsigned char *str;
 624      int len, *nchars, *nbytes;
 625 {
 626   const unsigned char *endp = str + len;
 627   int n, chars = 0, bytes = 0;
 628
 629   if (len >= MAX_MULTIBYTE_LENGTH)
 630     {
 631       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 632       while (str < adjusted_endp)
 633         {
 634           if (! CHAR_BYTE8_HEAD_P (*str)
 635               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 636             str += n, bytes += n;
 637           else
 638             str++, bytes += 2;
 639           chars++;
 640         }
 641     }
 642   while (str < endp)
 643     {
 644       if (! CHAR_BYTE8_HEAD_P (*str)
 645           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 646         str += n, bytes += n;
 647       else
 648         str++, bytes += 2;
 649       chars++;
 650     }
 651
 652   *nchars = chars;
 653   *nbytes = bytes;
 654   return;
 655 }
 656
 657 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 658    It actually converts only such 8-bit characters that don't contruct
 659    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 660    NCHARS is nonzero, set *NCHARS to the number of characters in the
 661    text.  It is assured that we can use LEN bytes at STR as a work
 662    area and that is enough.  Return the number of bytes of the
 663    resulting text.  */
 664
 665 int
 666 str_as_multibyte (str, len, nbytes, nchars)
 667      unsigned char *str;
 668      int len, nbytes, *nchars;
 669 {
 670   unsigned char *p = str, *endp = str + nbytes;
 671   unsigned char *to;
 672   int chars = 0;
 673   int n;
 674
 675   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 676     {
 677       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 678       while (p < adjusted_endp
 679              && ! CHAR_BYTE8_HEAD_P (*p)
 680              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 681         p += n, chars++;
 682     }
 683   while (p < endp
 684          && ! CHAR_BYTE8_HEAD_P (*p)
 685          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 686     p += n, chars++;
 687   if (nchars)
 688     *nchars = chars;
 689   if (p == endp)
 690     return nbytes;
 691
 692   to = p;
 693   nbytes = endp - p;
 694   endp = str + len;
 695   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 696   p = endp - nbytes;
 697
 698   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 699     {
 700       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 701       while (p < adjusted_endp)
 702         {
 703           if (! CHAR_BYTE8_HEAD_P (*p)
 704               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 705             {
 706               while (n--)
 707                 *to++ = *p++;
 708             }
 709           else
 710             {
 711               int c = *p++;
 712               c = BYTE8_TO_CHAR (c);
 713               to += CHAR_STRING (c, to);
 714             }
 715         }
 716       chars++;
 717     }
 718   while (p < endp)
 719     {
 720       if (! CHAR_BYTE8_HEAD_P (*p)
 721           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 722         {
 723           while (n--)
 724             *to++ = *p++;
 725         }
 726       else
 727         {
 728           int c = *p++;
 729           c = BYTE8_TO_CHAR (c);
 730           to += CHAR_STRING (c, to);
 731         }
 732       chars++;
 733     }
 734   if (nchars)
 735     *nchars = chars;
 736   return (to - str);
 737 }
 738
 739 /* Parse unibyte string at STR of LEN bytes, and return the number of
 740    bytes it may ocupy when converted to multibyte string by
 741    `str_to_multibyte'.  */
 742
 743 int
 744 parse_str_to_multibyte (str, len)
 745      unsigned char *str;
 746      int len;
 747 {
 748   unsigned char *endp = str + len;
 749   int bytes;
 750
 751   for (bytes = 0; str < endp; str++)
 752     bytes += (*str < 0x80) ? 1 : 2;
 753   return bytes;
 754 }
 755
 756
 757 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 758    that contains the same single-byte characters.  It actually
 759    converts all 8-bit characters to multibyte forms.  It is assured
 760    that we can use LEN bytes at STR as a work area and that is
 761    enough.  */
 762
 763 int
 764 str_to_multibyte (str, len, bytes)
 765      unsigned char *str;
 766      int len, bytes;
 767 {
 768   unsigned char *p = str, *endp = str + bytes;
 769   unsigned char *to;
 770
 771   while (p < endp && *p < 0x80) p++;
 772   if (p == endp)
 773     return bytes;
 774   to = p;
 775   bytes = endp - p;
 776   endp = str + len;
 777   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 778   p = endp - bytes;
 779   while (p < endp)
 780     {
 781       int c = *p++;
 782
 783       if (c >= 0x80)
 784         c = BYTE8_TO_CHAR (c);
 785       to += CHAR_STRING (c, to);
 786     }
 787   return (to - str);
 788 }
 789
 790 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 791    actually converts characters in the range 0x80..0xFF to
 792    unibyte.  */
 793
 794 int
 795 str_as_unibyte (str, bytes)
 796      unsigned char *str;
 797      int bytes;
 798 {
 799   const unsigned char *p = str, *endp = str + bytes;
 800   unsigned char *to;
 801   int c, len;
 802
 803   while (p < endp)
 804     {
 805       c = *p;
 806       len = BYTES_BY_CHAR_HEAD (c);
 807       if (CHAR_BYTE8_HEAD_P (c))
 808         break;
 809       p += len;
 810     }
 811   to = str + (p - str);
 812   while (p < endp)
 813     {
 814       c = *p;
 815       len = BYTES_BY_CHAR_HEAD (c);
 816       if (CHAR_BYTE8_HEAD_P (c))
 817         {
 818           c = STRING_CHAR_ADVANCE (p);
 819           *to++ = CHAR_TO_BYTE8 (c);
 820         }
 821       else
 822         {
 823           while (len--) *to++ = *p++;
 824         }
 825     }
 826   return (to - str);
 827 }
 828
 829 /* Convert eight-bit chars in SRC (in multibyte form) to the
 830    corresponding byte and store in DST.  CHARS is the number of
 831    characters in SRC.  The value is the number of bytes stored in DST.
 832    Usually, the value is the same as CHARS, but is less than it if SRC
 833    contains a non-ASCII, non-eight-bit characater.  If ACCEPT_LATIN_1
 834    is nonzero, a Latin-1 character is accepted and converted to a byte
 835    of that character code.
 836    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 837
 838 EMACS_INT
 839 str_to_unibyte (src, dst, chars, accept_latin_1)
 840      const unsigned char *src;
 841      unsigned char *dst;
 842      EMACS_INT chars;
 843      int accept_latin_1;
 844 {
 845   EMACS_INT i;
 846
 847   for (i = 0; i < chars; i++)
 848     {
 849       int c = STRING_CHAR_ADVANCE (src);
 850
 851       if (CHAR_BYTE8_P (c))
 852         c = CHAR_TO_BYTE8 (c);
 853       else if (! ASCII_CHAR_P (c)
 854                && (! accept_latin_1 || c >= 0x100))
 855         return i;
 856       *dst++ = c;
 857     }
 858   return i;
 859 }
 860
 861
 862 int
 863 string_count_byte8 (string)
 864      Lisp_Object string;
 865 {
 866   int multibyte = STRING_MULTIBYTE (string);
 867   int nbytes = SBYTES (string);
 868   unsigned char *p = SDATA (string);
 869   unsigned char *pend = p + nbytes;
 870   int count = 0;
 871   int c, len;
 872
 873   if (multibyte)
 874     while (p < pend)
 875       {
 876         c = *p;
 877         len = BYTES_BY_CHAR_HEAD (c);
 878
 879         if (CHAR_BYTE8_HEAD_P (c))
 880           count++;
 881         p += len;
 882       }
 883   else
 884     while (p < pend)
 885       {
 886         if (*p++ >= 0x80)
 887           count++;
 888       }
 889   return count;
 890 }
 891
 892
 893 Lisp_Object
 894 string_escape_byte8 (string)
 895      Lisp_Object string;
 896 {
 897   int nchars = SCHARS (string);
 898   int nbytes = SBYTES (string);
 899   int multibyte = STRING_MULTIBYTE (string);
 900   int byte8_count;
 901   const unsigned char *src, *src_end;
 902   unsigned char *dst;
 903   Lisp_Object val;
 904   int c, len;
 905
 906   if (multibyte && nchars == nbytes)
 907     return string;
 908
 909   byte8_count = string_count_byte8 (string);
 910
 911   if (byte8_count == 0)
 912     return string;
 913
 914   if (multibyte)
 915     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 916     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 917                                         nbytes + byte8_count * 2);
 918   else
 919     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 920     val = make_uninit_string (nbytes + byte8_count * 3);
 921
 922   src = SDATA (string);
 923   src_end = src + nbytes;
 924   dst = SDATA (val);
 925   if (multibyte)
 926     while (src < src_end)
 927       {
 928         c = *src;
 929         len = BYTES_BY_CHAR_HEAD (c);
 930
 931         if (CHAR_BYTE8_HEAD_P (c))
 932           {
 933             c = STRING_CHAR_ADVANCE (src);
 934             c = CHAR_TO_BYTE8 (c);
 935             sprintf ((char *) dst, "\\%03o", c);
 936             dst += 4;
 937           }
 938         else
 939           while (len--) *dst++ = *src++;
 940       }
 941   else
 942     while (src < src_end)
 943       {
 944         c = *src++;
 945         if (c >= 0x80)
 946           {
 947             sprintf ((char *) dst, "\\%03o", c);
 948             dst += 4;
 949           }
 950         else
 951           *dst++ = c;
 952       }
 953   return val;
 954 }
 955
 956 \f
 957 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 958        doc: /*
 959 Concatenate all the argument characters and make the result a string.
 960 usage: (string &rest CHARACTERS)  */)
 961      (n, args)
 962      int n;
 963      Lisp_Object *args;
 964 {
 965   int i;
 966   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 967   unsigned char *p = buf;
 968   int c;
 969
 970   for (i = 0; i < n; i++)
 971     {
 972       CHECK_CHARACTER (args[i]);
 973       c = XINT (args[i]);
 974       p += CHAR_STRING (c, p);
 975     }
 976
 977   return make_string_from_bytes ((char *) buf, n, p - buf);
 978 }
 979
 980 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 981        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 982 usage: (unibyte-string &rest BYTES)  */)
 983      (n, args)
 984      int n;
 985      Lisp_Object *args;
 986 {
 987   int i;
 988   unsigned char *buf = (unsigned char *) alloca (n);
 989   unsigned char *p = buf;
 990   unsigned c;
 991
 992   for (i = 0; i < n; i++)
 993     {
 994       CHECK_NATNUM (args[i]);
 995       c = XFASTINT (args[i]);
 996       if (c >= 256)
 997         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 998       *p++ = c;
 999     }
1000
1001   return make_string_from_bytes ((char *) buf, n, p - buf);
1002 }
1003
1004 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
1005        Schar_resolve_modifiers, 1, 1, 0,
1006        doc: /* Resolve modifiers in the character CHAR.
1007 The value is a character with modifiers resolved into the character
1008 code.  Unresolved modifiers are kept in the value.
1009 usage: (char-resolve-modifiers CHAR)  */)
1010      (character)
1011      Lisp_Object character;
1012 {
1013   int c;
1014
1015   CHECK_NUMBER (character);
1016   c = XINT (character);
1017   return make_number (char_resolve_modifier_mask (c));
1018 }
1019
1020 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
1021        doc: /* Return a byte value of a character at point.
1022 Optional 1st arg POSITION, if non-nil, is a position of a character to get
1023 a byte value.
1024 Optional 2nd arg STRING, if non-nil, is a string of which first
1025 character is a target to get a byte value.  In this case, POSITION, if
1026 non-nil, is an index of a target character in the string.
1027
1028 If the current buffer (or STRING) is multibyte, and the target
1029 character is not ASCII nor 8-bit character, an error is signalled.  */)
1030      (position, string)
1031      Lisp_Object position, string;
1032 {
1033   int c;
1034   EMACS_INT pos;
1035   unsigned char *p;
1036
1037   if (NILP (string))
1038     {
1039       if (NILP (position))
1040         {
1041           p = PT_ADDR;
1042         }
1043       else
1044         {
1045           CHECK_NUMBER_COERCE_MARKER (position);
1046           if (XINT (position) < BEGV || XINT (position) >= ZV)
1047             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1048           pos = XFASTINT (position);
1049           p = CHAR_POS_ADDR (pos);
1050         }
1051       if (NILP (BUF_ENABLE_MULTIBYTE_CHARACTERS (current_buffer)))
1052         return make_number (*p);
1053     }
1054   else
1055     {
1056       CHECK_STRING (string);
1057       if (NILP (position))
1058         {
1059           p = SDATA (string);
1060         }
1061       else
1062         {
1063           CHECK_NATNUM (position);
1064           if (XINT (position) >= SCHARS (string))
1065             args_out_of_range (string, position);
1066           pos = XFASTINT (position);
1067           p = SDATA (string) + string_char_to_byte (string, pos);
1068         }
1069       if (! STRING_MULTIBYTE (string))
1070         return make_number (*p);
1071     }
1072   c = STRING_CHAR (p);
1073   if (CHAR_BYTE8_P (c))
1074     c = CHAR_TO_BYTE8 (c);
1075   else if (! ASCII_CHAR_P (c))
1076     error ("Not an ASCII nor an 8-bit character: %d", c);
1077   return make_number (c);
1078 }
1079
1080
 1081 void
 1082 init_character_once ()
 1083 {
        /* Intentionally empty: this function takes no arguments, returns
           nothing and performs no work.
           NOTE(review): presumably kept as a one-time initialization hook
           that callers elsewhere still invoke -- confirm before removing.  */
 1084 }
1085
1086 #ifdef emacs
1087
 1088 void
 1089 syms_of_character ()
 1090 {
        /* Register what this file exposes to Lisp: the symbols it uses,
           its primitives (defsubr), and its Lisp variables (DEFVAR_LISP)
           together with their initial values.  */

        /* Symbols referenced from C.  Qauto_fill_chars is used below as
           the purpose of the auto-fill-chars char-table.  */
 1091   DEFSYM (Qcharacterp, "characterp");
 1092   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
 1093
        /* Vchar_unify_table is not declared with DEFVAR_LISP in this
           function, so it is registered with staticpro explicitly and
           starts out as nil.  */
 1094   staticpro (&Vchar_unify_table);
 1095   Vchar_unify_table = Qnil;
 1096
        /* Primitives defined with DEFUN in this file.  */
 1097   defsubr (&Smax_char);
 1098   defsubr (&Scharacterp);
 1099   defsubr (&Sunibyte_char_to_multibyte);
 1100   defsubr (&Smultibyte_char_to_unibyte);
 1101   defsubr (&Schar_bytes);
 1102   defsubr (&Schar_width);
 1103   defsubr (&Sstring_width);
 1104   defsubr (&Schar_direction);
 1105   defsubr (&Sstring);
 1106   defsubr (&Sunibyte_string);
 1107   defsubr (&Schar_resolve_modifiers);
 1108   defsubr (&Sget_byte);
 1109
 1110   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
 1111                doc: /*
 1112 Vector recording all translation tables ever defined.
 1113 Each element is a pair (SYMBOL . TABLE) relating the table to the
 1114 symbol naming it.  The ID of a translation table is an index into this vector.  */);
        /* Start with 16 slots, all nil.  */
 1115   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
 1116
 1117   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
 1118                doc: /*
 1119 A char-table for characters which invoke auto-filling.
 1120 Such characters have value t in this table.  */);
        /* Default is nil; only space and newline are marked initially.  */
 1121   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
 1122   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
 1123   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
 1124
 1125   DEFVAR_LISP ("char-width-table", &Vchar_width_table,
 1126                doc: /*
 1127 A char-table for width (columns) of each character.  */);
        /* Every character defaults to width 1.  The ranges 0x80..0x9F and
           MAX_5_BYTE_CHAR + 1..MAX_CHAR are given width 4.
           NOTE(review): presumably because those characters are displayed
           as four-column escape sequences -- confirm.  */
 1128   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
 1129   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
 1130   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
 1131                         make_number (4));
 1132
 1133   DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
 1134                doc: /* A char-table for direction of each character.  */);
        /* Every character starts with the value 1.  */
 1135   Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
 1136
 1137   DEFVAR_LISP ("printable-chars", &Vprintable_chars,
 1138                doc: /* A char-table for each printable character.  */);
        /* Default is nil; the ranges 32..126 and 160..MAX_5_BYTE_CHAR are
           set to t.  */
 1139   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
 1140   Fset_char_table_range (Vprintable_chars,
 1141                          Fcons (make_number (32), make_number (126)), Qt);
 1142   Fset_char_table_range (Vprintable_chars,
 1143                          Fcons (make_number (160),
 1144                                 make_number (MAX_5_BYTE_CHAR)), Qt);
 1145
 1146   DEFVAR_LISP ("char-script-table", &Vchar_script_table,
 1147                doc: /* Char table of script symbols.
 1148 It has one extra slot whose value is a list of script symbols.  */);
 1149
 1150   /* Intern this now in case it isn't already done.
 1151      Setting this variable twice is harmless.
 1152      But don't staticpro it here--that is done in alloc.c.  */
 1153   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
 1154   DEFSYM (Qchar_script_table, "char-script-table");
        /* The `char-table-extra-slots' property of `char-script-table' is
           set to 1 before the table is created with that symbol as its
           purpose; this matches the single extra slot described in the
           variable's documentation above.  */
 1155   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
 1156   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
 1157
 1158   DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
 1159                doc: /* Alist of scripts vs the representative characters.
 1160 Each element is a cons (SCRIPT . CHARS).
 1161 SCRIPT is a symbol representing a script or a subgroup of a script.
 1162 CHARS is a list or a vector of characters.
 1163 If it is a list, all characters in the list are necessary for supporting SCRIPT.
 1164 If it is a vector, one of the characters in the vector is necessary.
 1165 This variable is used to find a font for a specific script.  */);
 1166   Vscript_representative_chars = Qnil;
 1167
 1168   DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
 1169                doc: /* Char table of Unicode's "General Category".
 1170 All Unicode characters have one of the following values (symbol):
 1171   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
 1172   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
 1173 See The Unicode Standard for the meaning of those values.  */);
 1174   /* The correct char-table is set up in characters.el; until then the
           variable holds nil.  */
 1175   Vunicode_category_table = Qnil;
 1176 }
1177
1178 #endif /* emacs */
1179
1180 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1181    (do not change this comment) */