src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      Free Software Foundation, Inc.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <setjmp.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
  51 Lisp_Object Qcharacterp;
  52
  53 /* Vector of translation table ever defined.
  54    ID of a translation table is used to index this vector.  */
  55 Lisp_Object Vtranslation_table_vector;
  56
  57 /* A char-table for characters which may invoke auto-filling.  */
  58 Lisp_Object Vauto_fill_chars;
  59
  60 Lisp_Object Qauto_fill_chars;
  61
  62 /* Char-table of information about which character to unify to which
  63    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  64 Lisp_Object Vchar_unify_table;
  65
  66 /* A char-table.  An element is non-nil iff the corresponding
  67    character has a printable glyph.  */
  68 Lisp_Object Vprintable_chars;
  69
  70 /* A char-table.  An elemnent is a column-width of the corresponding
  71    character.  */
  72 Lisp_Object Vchar_width_table;
  73
  74 /* A char-table.  An element is a symbol indicating the direction
  75    property of corresponding character.  */
  76 Lisp_Object Vchar_direction_table;
  77
  78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  79 unsigned char *_fetch_multibyte_char_p;
  80
  81 /* Char table of scripts.  */
  82 Lisp_Object Vchar_script_table;
  83
  84 /* Alist of scripts vs representative characters.  */
  85 Lisp_Object Vscript_representative_chars;
  86
  87 static Lisp_Object Qchar_script_table;
  88
  89 Lisp_Object Vunicode_category_table;
  90 \f
  91
  92 /* If character code C has modifier masks, reflect them to the
  93    character code if possible.  Return the resulting code.  */
  94
  95 int
  96 char_resolve_modifier_mask (c)
  97      int c;
  98 {
  99   /* A non-ASCII character can't reflect modifier bits to the code.  */
 100   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 101     return c;
 102
 103   /* For Meta, Shift, and Control modifiers, we need special care.  */
 104   if (c & CHAR_SHIFT)
 105     {
 106       /* Shift modifier is valid only with [A-Za-z].  */
 107       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 108         c &= ~CHAR_SHIFT;
 109       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 110         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 111       /* Shift modifier for control characters and SPC is ignored.  */
 112       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
 113         c &= ~CHAR_SHIFT;
 114     }
 115   if (c & CHAR_CTL)
 116     {
 117       /* Simulate the code in lread.c.  */
 118       /* Allow `\C- ' and `\C-?'.  */
 119       if ((c & 0377) == ' ')
 120         c &= ~0177 & ~ CHAR_CTL;
 121       else if ((c & 0377) == '?')
 122         c = 0177 | (c & ~0177 & ~CHAR_CTL);
 123       /* ASCII control chars are made from letters (both cases),
 124          as well as the non-letters within 0100...0137.  */
 125       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 126         c &= (037 | (~0177 & ~CHAR_CTL));
 127       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 128         c &= (037 | (~0177 & ~CHAR_CTL));
 129     }
 130 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 131   if (c & CHAR_META)
 132     {
 133       /* Move the meta bit to the right place for a string.  */
 134       c = (c & ~CHAR_META) | 0x80;
 135     }
 136 #endif
 137
 138   return c;
 139 }
 140
 141
 142 /* Store multibyte form of character C at P.  If C has modifier bits,
 143    handle them appropriately.  */
 144
 145 int
 146 char_string (c, p)
 147      unsigned c;
 148      unsigned char *p;
 149 {
 150   int bytes;
 151
 152   if (c & CHAR_MODIFIER_MASK)
 153     {
 154       c = (unsigned) char_resolve_modifier_mask ((int) c);
 155       /* If C still has any modifier bits, just ignore it.  */
 156       c &= ~CHAR_MODIFIER_MASK;
 157     }
 158
 159   MAYBE_UNIFY_CHAR (c);
 160
 161   if (c <= MAX_3_BYTE_CHAR)
 162     {
 163       bytes = CHAR_STRING (c, p);
 164     }
 165   else if (c <= MAX_4_BYTE_CHAR)
 166     {
 167       p[0] = (0xF0 | (c >> 18));
 168       p[1] = (0x80 | ((c >> 12) & 0x3F));
 169       p[2] = (0x80 | ((c >> 6) & 0x3F));
 170       p[3] = (0x80 | (c & 0x3F));
 171       bytes = 4;
 172     }
 173   else if (c <= MAX_5_BYTE_CHAR)
 174     {
 175       p[0] = 0xF8;
 176       p[1] = (0x80 | ((c >> 18) & 0x0F));
 177       p[2] = (0x80 | ((c >> 12) & 0x3F));
 178       p[3] = (0x80 | ((c >> 6) & 0x3F));
 179       p[4] = (0x80 | (c & 0x3F));
 180       bytes = 5;
 181     }
 182   else if (c <= MAX_CHAR)
 183     {
 184       c = CHAR_TO_BYTE8 (c);
 185       bytes = BYTE8_STRING (c, p);
 186     }
 187   else
 188     error ("Invalid character: %d", c);
 189
 190   return bytes;
 191 }
 192
 193
 194 /* Return a character whose multibyte form is at P.  Set LEN is not
 195    NULL, it must be a pointer to integer.  In that case, set *LEN to
 196    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 197    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 198    the ending address (i.e. the starting address of the next
 199    character) of the multibyte form.  */
 200
 201 int
 202 string_char (p, advanced, len)
 203      const unsigned char *p;
 204      const unsigned char **advanced;
 205      int *len;
 206 {
 207   int c;
 208   const unsigned char *saved_p = p;
 209
 210   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 211     {
 212       c = STRING_CHAR_ADVANCE (p);
 213     }
 214   else if (! (*p & 0x08))
 215     {
 216       c = ((((p)[0] & 0xF) << 18)
 217            | (((p)[1] & 0x3F) << 12)
 218            | (((p)[2] & 0x3F) << 6)
 219            | ((p)[3] & 0x3F));
 220       p += 4;
 221     }
 222   else
 223     {
 224       c = ((((p)[1] & 0x3F) << 18)
 225            | (((p)[2] & 0x3F) << 12)
 226            | (((p)[3] & 0x3F) << 6)
 227            | ((p)[4] & 0x3F));
 228       p += 5;
 229     }
 230
 231   MAYBE_UNIFY_CHAR (c);
 232
 233   if (len)
 234     *len = p - saved_p;
 235   if (advanced)
 236     *advanced = p;
 237   return c;
 238 }
 239
 240
 241 /* Translate character C by translation table TABLE.  If C is
 242    negative, translate a character specified by CHARSET and CODE.  If
 243    no translation is found in TABLE, return the untranslated
 244    character.  If TABLE is a list, elements are char tables.  In this
 245    case, translace C by all tables.  */
 246
 247 int
 248 translate_char (table, c)
 249      Lisp_Object table;
 250      int c;
 251 {
 252   if (CHAR_TABLE_P (table))
 253     {
 254       Lisp_Object ch;
 255
 256       ch = CHAR_TABLE_REF (table, c);
 257       if (CHARACTERP (ch))
 258         c = XINT (ch);
 259     }
 260   else
 261     {
 262       for (; CONSP (table); table = XCDR (table))
 263         c = translate_char (XCAR (table), c);
 264     }
 265   return c;
 266 }
 267
 268 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 269    them, return (C & 0xFF).
 270
 271    The argument REV_TBL is now ignored.  It will be removed in the
 272    future.  */
 273
 274 int
 275 multibyte_char_to_unibyte (c, rev_tbl)
 276      int c;
 277      Lisp_Object rev_tbl;
 278 {
 279   if (c < 0x80)
 280     return c;
 281   if (CHAR_BYTE8_P (c))
 282     return CHAR_TO_BYTE8 (c);
 283   return (c & 0xFF);
 284 }
 285
 286 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 287    by charset_unibyte.  */
 288
 289 int
 290 multibyte_char_to_unibyte_safe (c)
 291      int c;
 292 {
 293   if (c < 0x80)
 294     return c;
 295   if (CHAR_BYTE8_P (c))
 296     return CHAR_TO_BYTE8 (c);
 297   return -1;
 298 }
 299
 300 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 301        doc: /* Return non-nil if OBJECT is a character.  */)
 302      (object, ignore)
 303      Lisp_Object object, ignore;
 304 {
 305   return (CHARACTERP (object) ? Qt : Qnil);
 306 }
 307
 308 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 309        doc: /* Return the character of the maximum code.  */)
 310      ()
 311 {
 312   return make_number (MAX_CHAR);
 313 }
 314
 315 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 316        Sunibyte_char_to_multibyte, 1, 1, 0,
 317        doc: /* Convert the byte CH to multibyte character.  */)
 318      (ch)
 319      Lisp_Object ch;
 320 {
 321   int c;
 322
 323   CHECK_CHARACTER (ch);
 324   c = XFASTINT (ch);
 325   if (c >= 0x100)
 326     error ("Not a unibyte character: %d", c);
 327   MAKE_CHAR_MULTIBYTE (c);
 328   return make_number (c);
 329 }
 330
 331 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 332        Smultibyte_char_to_unibyte, 1, 1, 0,
 333        doc: /* Convert the multibyte character CH to a byte.
 334 If the multibyte character does not represent a byte, return -1.  */)
 335      (ch)
 336      Lisp_Object ch;
 337 {
 338   int cm;
 339
 340   CHECK_CHARACTER (ch);
 341   cm = XFASTINT (ch);
 342   if (cm < 256)
 343     /* Can't distinguish a byte read from a unibyte buffer from
 344        a latin1 char, so let's let it slide.  */
 345     return ch;
 346   else
 347     {
 348       int cu = CHAR_TO_BYTE_SAFE (cm);
 349       return make_number (cu);
 350     }
 351 }
 352
 353 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 354        doc: /* Return 1 regardless of the argument CHAR.
 355 This is now an obsolete function.  We keep it just for backward compatibility.
 356 usage: (char-bytes CHAR)  */)
 357      (ch)
 358      Lisp_Object ch;
 359 {
 360   CHECK_CHARACTER (ch);
 361   return make_number (1);
 362 }
 363
 364 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 365        doc: /* Return width of CHAR when displayed in the current buffer.
 366 The width is measured by how many columns it occupies on the screen.
 367 Tab is taken to occupy `tab-width' columns.
 368 usage: (char-width CHAR)  */)
 369      (ch)
 370        Lisp_Object ch;
 371 {
 372   Lisp_Object disp;
 373   int c, width;
 374   struct Lisp_Char_Table *dp = buffer_display_table ();
 375
 376   CHECK_CHARACTER (ch);
 377   c = XINT (ch);
 378
 379   /* Get the way the display table would display it.  */
 380   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 381
 382   if (VECTORP (disp))
 383     width = ASIZE (disp);
 384   else
 385     width = CHAR_WIDTH (c);
 386
 387   return make_number (width);
 388 }
 389
 390 /* Return width of string STR of length LEN when displayed in the
 391    current buffer.  The width is measured by how many columns it
 392    occupies on the screen.  If PRECISION > 0, return the width of
 393    longest substring that doesn't exceed PRECISION, and set number of
 394    characters and bytes of the substring in *NCHARS and *NBYTES
 395    respectively.  */
 396
 397 int
 398 c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
 399 {
 400   int i = 0, i_byte = 0;
 401   int width = 0;
 402   struct Lisp_Char_Table *dp = buffer_display_table ();
 403
 404   while (i_byte < len)
 405     {
 406       int bytes, thiswidth;
 407       Lisp_Object val;
 408       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 409
 410       if (dp)
 411         {
 412           val = DISP_CHAR_VECTOR (dp, c);
 413           if (VECTORP (val))
 414             thiswidth = XVECTOR_SIZE (val);
 415           else
 416             thiswidth = CHAR_WIDTH (c);
 417         }
 418       else
 419         {
 420           thiswidth = CHAR_WIDTH (c);
 421         }
 422
 423       if (precision > 0
 424           && (width + thiswidth > precision))
 425         {
 426           *nchars = i;
 427           *nbytes = i_byte;
 428           return width;
 429         }
 430       i++;
 431       i_byte += bytes;
 432       width += thiswidth;
 433   }
 434
 435   if (precision > 0)
 436     {
 437       *nchars = i;
 438       *nbytes = i_byte;
 439     }
 440
 441   return width;
 442 }
 443
 444 /* Return width of string STR of length LEN when displayed in the
 445    current buffer.  The width is measured by how many columns it
 446    occupies on the screen.  */
 447
 448 int
 449 strwidth (str, len)
 450      unsigned char *str;
 451      int len;
 452 {
 453   return c_string_width (str, len, -1, NULL, NULL);
 454 }
 455
 456 /* Return width of Lisp string STRING when displayed in the current
 457    buffer.  The width is measured by how many columns it occupies on
 458    the screen while paying attention to compositions.  If PRECISION >
 459    0, return the width of longest substring that doesn't exceed
 460    PRECISION, and set number of characters and bytes of the substring
 461    in *NCHARS and *NBYTES respectively.  */
 462
 463 int
 464 lisp_string_width (string, precision, nchars, nbytes)
 465      Lisp_Object string;
 466      int precision, *nchars, *nbytes;
 467 {
 468   int len = SCHARS (string);
 469   /* This set multibyte to 0 even if STRING is multibyte when it
 470      contains only ascii and eight-bit-graphic, but that's
 471      intentional.  */
 472   int multibyte = len < SBYTES (string);
 473   unsigned char *str = SDATA (string);
 474   int i = 0, i_byte = 0;
 475   int width = 0;
 476   struct Lisp_Char_Table *dp = buffer_display_table ();
 477
 478   while (i < len)
 479     {
 480       int chars, bytes, thiswidth;
 481       Lisp_Object val;
 482       int cmp_id;
 483       EMACS_INT ignore, end;
 484
 485       if (find_composition (i, -1, &ignore, &end, &val, string)
 486           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 487               >= 0))
 488         {
 489           thiswidth = composition_table[cmp_id]->width;
 490           chars = end - i;
 491           bytes = string_char_to_byte (string, end) - i_byte;
 492         }
 493       else
 494         {
 495           int c;
 496
 497           if (multibyte)
 498             c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 499           else
 500             c = str[i_byte], bytes = 1;
 501           chars = 1;
 502           if (dp)
 503             {
 504               val = DISP_CHAR_VECTOR (dp, c);
 505               if (VECTORP (val))
 506                 thiswidth = XVECTOR_SIZE (val);
 507               else
 508                 thiswidth = CHAR_WIDTH (c);
 509             }
 510           else
 511             {
 512               thiswidth = CHAR_WIDTH (c);
 513             }
 514         }
 515
 516       if (precision > 0
 517           && (width + thiswidth > precision))
 518         {
 519           *nchars = i;
 520           *nbytes = i_byte;
 521           return width;
 522         }
 523       i += chars;
 524       i_byte += bytes;
 525       width += thiswidth;
 526   }
 527
 528   if (precision > 0)
 529     {
 530       *nchars = i;
 531       *nbytes = i_byte;
 532     }
 533
 534   return width;
 535 }
 536
 537 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 538        doc: /* Return width of STRING when displayed in the current buffer.
 539 Width is measured by how many columns it occupies on the screen.
 540 When calculating width of a multibyte character in STRING,
 541 only the base leading-code is considered; the validity of
 542 the following bytes is not checked.  Tabs in STRING are always
 543 taken to occupy `tab-width' columns.
 544 usage: (string-width STRING)  */)
 545      (str)
 546      Lisp_Object str;
 547 {
 548   Lisp_Object val;
 549
 550   CHECK_STRING (str);
 551   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 552   return val;
 553 }
 554
 555 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 556        doc: /* Return the direction of CHAR.
 557 The returned value is 0 for left-to-right and 1 for right-to-left.
 558 usage: (char-direction CHAR)  */)
 559      (ch)
 560      Lisp_Object ch;
 561 {
 562   int c;
 563
 564   CHECK_CHARACTER (ch);
 565   c = XINT (ch);
 566   return CHAR_TABLE_REF (Vchar_direction_table, c);
 567 }
 568
 569 /* Return the number of characters in the NBYTES bytes at PTR.
 570    This works by looking at the contents and checking for multibyte
 571    sequences while assuming that there's no invalid sequence.
 572    However, if the current buffer has enable-multibyte-characters =
 573    nil, we treat each byte as a character.  */
 574
 575 EMACS_INT
 576 chars_in_text (ptr, nbytes)
 577      const unsigned char *ptr;
 578      EMACS_INT nbytes;
 579 {
 580   /* current_buffer is null at early stages of Emacs initialization.  */
 581   if (current_buffer == 0
 582       || NILP (current_buffer->enable_multibyte_characters))
 583     return nbytes;
 584
 585   return multibyte_chars_in_text (ptr, nbytes);
 586 }
 587
 588 /* Return the number of characters in the NBYTES bytes at PTR.
 589    This works by looking at the contents and checking for multibyte
 590    sequences while assuming that there's no invalid sequence.  It
 591    ignores enable-multibyte-characters.  */
 592
 593 EMACS_INT
 594 multibyte_chars_in_text (ptr, nbytes)
 595      const unsigned char *ptr;
 596      EMACS_INT nbytes;
 597 {
 598   const unsigned char *endp = ptr + nbytes;
 599   int chars = 0;
 600
 601   while (ptr < endp)
 602     {
 603       int len = MULTIBYTE_LENGTH (ptr, endp);
 604
 605       if (len == 0)
 606         abort ();
 607       ptr += len;
 608       chars++;
 609     }
 610
 611   return chars;
 612 }
 613
 614 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 615    characters and bytes in it, and store them in *NCHARS and *NBYTES
 616    respectively.  On counting bytes, pay attention to that 8-bit
 617    characters not constructing a valid multibyte sequence are
 618    represented by 2-byte in a multibyte text.  */
 619
 620 void
 621 parse_str_as_multibyte (str, len, nchars, nbytes)
 622      const unsigned char *str;
 623      int len, *nchars, *nbytes;
 624 {
 625   const unsigned char *endp = str + len;
 626   int n, chars = 0, bytes = 0;
 627
 628   if (len >= MAX_MULTIBYTE_LENGTH)
 629     {
 630       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 631       while (str < adjusted_endp)
 632         {
 633           if (! CHAR_BYTE8_HEAD_P (*str)
 634               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 635             str += n, bytes += n;
 636           else
 637             str++, bytes += 2;
 638           chars++;
 639         }
 640     }
 641   while (str < endp)
 642     {
 643       if (! CHAR_BYTE8_HEAD_P (*str)
 644           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 645         str += n, bytes += n;
 646       else
 647         str++, bytes += 2;
 648       chars++;
 649     }
 650
 651   *nchars = chars;
 652   *nbytes = bytes;
 653   return;
 654 }
 655
 656 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 657    It actually converts only such 8-bit characters that don't contruct
 658    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 659    NCHARS is nonzero, set *NCHARS to the number of characters in the
 660    text.  It is assured that we can use LEN bytes at STR as a work
 661    area and that is enough.  Return the number of bytes of the
 662    resulting text.  */
 663
 664 int
 665 str_as_multibyte (str, len, nbytes, nchars)
 666      unsigned char *str;
 667      int len, nbytes, *nchars;
 668 {
 669   unsigned char *p = str, *endp = str + nbytes;
 670   unsigned char *to;
 671   int chars = 0;
 672   int n;
 673
 674   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 675     {
 676       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 677       while (p < adjusted_endp
 678              && ! CHAR_BYTE8_HEAD_P (*p)
 679              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 680         p += n, chars++;
 681     }
 682   while (p < endp
 683          && ! CHAR_BYTE8_HEAD_P (*p)
 684          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 685     p += n, chars++;
 686   if (nchars)
 687     *nchars = chars;
 688   if (p == endp)
 689     return nbytes;
 690
 691   to = p;
 692   nbytes = endp - p;
 693   endp = str + len;
 694   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 695   p = endp - nbytes;
 696
 697   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 698     {
 699       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 700       while (p < adjusted_endp)
 701         {
 702           if (! CHAR_BYTE8_HEAD_P (*p)
 703               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 704             {
 705               while (n--)
 706                 *to++ = *p++;
 707             }
 708           else
 709             {
 710               int c = *p++;
 711               c = BYTE8_TO_CHAR (c);
 712               to += CHAR_STRING (c, to);
 713             }
 714         }
 715       chars++;
 716     }
 717   while (p < endp)
 718     {
 719       if (! CHAR_BYTE8_HEAD_P (*p)
 720           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 721         {
 722           while (n--)
 723             *to++ = *p++;
 724         }
 725       else
 726         {
 727           int c = *p++;
 728           c = BYTE8_TO_CHAR (c);
 729           to += CHAR_STRING (c, to);
 730         }
 731       chars++;
 732     }
 733   if (nchars)
 734     *nchars = chars;
 735   return (to - str);
 736 }
 737
 738 /* Parse unibyte string at STR of LEN bytes, and return the number of
 739    bytes it may ocupy when converted to multibyte string by
 740    `str_to_multibyte'.  */
 741
 742 int
 743 parse_str_to_multibyte (str, len)
 744      unsigned char *str;
 745      int len;
 746 {
 747   unsigned char *endp = str + len;
 748   int bytes;
 749
 750   for (bytes = 0; str < endp; str++)
 751     bytes += (*str < 0x80) ? 1 : 2;
 752   return bytes;
 753 }
 754
 755
 756 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 757    that contains the same single-byte characters.  It actually
 758    converts all 8-bit characters to multibyte forms.  It is assured
 759    that we can use LEN bytes at STR as a work area and that is
 760    enough.  */
 761
 762 int
 763 str_to_multibyte (str, len, bytes)
 764      unsigned char *str;
 765      int len, bytes;
 766 {
 767   unsigned char *p = str, *endp = str + bytes;
 768   unsigned char *to;
 769
 770   while (p < endp && *p < 0x80) p++;
 771   if (p == endp)
 772     return bytes;
 773   to = p;
 774   bytes = endp - p;
 775   endp = str + len;
 776   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 777   p = endp - bytes;
 778   while (p < endp)
 779     {
 780       int c = *p++;
 781
 782       if (c >= 0x80)
 783         c = BYTE8_TO_CHAR (c);
 784       to += CHAR_STRING (c, to);
 785     }
 786   return (to - str);
 787 }
 788
 789 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 790    actually converts characters in the range 0x80..0xFF to
 791    unibyte.  */
 792
 793 int
 794 str_as_unibyte (str, bytes)
 795      unsigned char *str;
 796      int bytes;
 797 {
 798   const unsigned char *p = str, *endp = str + bytes;
 799   unsigned char *to;
 800   int c, len;
 801
 802   while (p < endp)
 803     {
 804       c = *p;
 805       len = BYTES_BY_CHAR_HEAD (c);
 806       if (CHAR_BYTE8_HEAD_P (c))
 807         break;
 808       p += len;
 809     }
 810   to = str + (p - str);
 811   while (p < endp)
 812     {
 813       c = *p;
 814       len = BYTES_BY_CHAR_HEAD (c);
 815       if (CHAR_BYTE8_HEAD_P (c))
 816         {
 817           c = STRING_CHAR_ADVANCE (p);
 818           *to++ = CHAR_TO_BYTE8 (c);
 819         }
 820       else
 821         {
 822           while (len--) *to++ = *p++;
 823         }
 824     }
 825   return (to - str);
 826 }
 827
 828 /* Convert eight-bit chars in SRC (in multibyte form) to the
 829    corresponding byte and store in DST.  CHARS is the number of
 830    characters in SRC.  The value is the number of bytes stored in DST.
 831    Usually, the value is the same as CHARS, but is less than it if SRC
 832    contains a non-ASCII, non-eight-bit character.  If ACCEPT_LATIN_1
 833    is nonzero, a Latin-1 character is accepted and converted to a byte
 834    of that character code.
 835    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 836
 837 EMACS_INT
 838 str_to_unibyte (src, dst, chars, accept_latin_1)
 839      const unsigned char *src;
 840      unsigned char *dst;
 841      EMACS_INT chars;
 842      int accept_latin_1;
 843 {
 844   EMACS_INT i;
 845
 846   for (i = 0; i < chars; i++)
 847     {
 848       int c = STRING_CHAR_ADVANCE (src);
 849
 850       if (CHAR_BYTE8_P (c))
 851         c = CHAR_TO_BYTE8 (c);
 852       else if (! ASCII_CHAR_P (c)
 853                && (! accept_latin_1 || c >= 0x100))
 854         return i;
 855       *dst++ = c;
 856     }
 857   return i;
 858 }
 859
 860
 861 int
 862 string_count_byte8 (string)
 863      Lisp_Object string;
 864 {
 865   int multibyte = STRING_MULTIBYTE (string);
 866   int nbytes = SBYTES (string);
 867   unsigned char *p = SDATA (string);
 868   unsigned char *pend = p + nbytes;
 869   int count = 0;
 870   int c, len;
 871
 872   if (multibyte)
 873     while (p < pend)
 874       {
 875         c = *p;
 876         len = BYTES_BY_CHAR_HEAD (c);
 877
 878         if (CHAR_BYTE8_HEAD_P (c))
 879           count++;
 880         p += len;
 881       }
 882   else
 883     while (p < pend)
 884       {
 885         if (*p++ >= 0x80)
 886           count++;
 887       }
 888   return count;
 889 }
 890
 891
 892 Lisp_Object
 893 string_escape_byte8 (string)
 894      Lisp_Object string;
 895 {
 896   int nchars = SCHARS (string);
 897   int nbytes = SBYTES (string);
 898   int multibyte = STRING_MULTIBYTE (string);
 899   int byte8_count;
 900   const unsigned char *src, *src_end;
 901   unsigned char *dst;
 902   Lisp_Object val;
 903   int c, len;
 904
 905   if (multibyte && nchars == nbytes)
 906     return string;
 907
 908   byte8_count = string_count_byte8 (string);
 909
 910   if (byte8_count == 0)
 911     return string;
 912
 913   if (multibyte)
 914     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 915     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 916                                         nbytes + byte8_count * 2);
 917   else
 918     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 919     val = make_uninit_string (nbytes + byte8_count * 3);
 920
 921   src = SDATA (string);
 922   src_end = src + nbytes;
 923   dst = SDATA (val);
 924   if (multibyte)
 925     while (src < src_end)
 926       {
 927         c = *src;
 928         len = BYTES_BY_CHAR_HEAD (c);
 929
 930         if (CHAR_BYTE8_HEAD_P (c))
 931           {
 932             c = STRING_CHAR_ADVANCE (src);
 933             c = CHAR_TO_BYTE8 (c);
 934             sprintf ((char *) dst, "\\%03o", c);
 935             dst += 4;
 936           }
 937         else
 938           while (len--) *dst++ = *src++;
 939       }
 940   else
 941     while (src < src_end)
 942       {
 943         c = *src++;
 944         if (c >= 0x80)
 945           {
 946             sprintf ((char *) dst, "\\%03o", c);
 947             dst += 4;
 948           }
 949         else
 950           *dst++ = c;
 951       }
 952   return val;
 953 }
 954
 955 \f
 956 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 957        doc: /*
 958 Concatenate all the argument characters and make the result a string.
 959 usage: (string &rest CHARACTERS)  */)
 960      (n, args)
 961      int n;
 962      Lisp_Object *args;
 963 {
 964   int i, c;
 965   unsigned char *buf, *p;
 966   Lisp_Object str;
 967   USE_SAFE_ALLOCA;
 968
 969   SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
 970   p = buf;
 971
 972   for (i = 0; i < n; i++)
 973     {
 974       CHECK_CHARACTER (args[i]);
 975       c = XINT (args[i]);
 976       p += CHAR_STRING (c, p);
 977     }
 978
 979   str = make_string_from_bytes ((char *) buf, n, p - buf);
 980   SAFE_FREE ();
 981   return str;
 982 }
 983
 984 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 985        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 986 usage: (unibyte-string &rest BYTES)  */)
 987      (n, args)
 988      int n;
 989      Lisp_Object *args;
 990 {
 991   int i, c;
 992   unsigned char *buf, *p;
 993   Lisp_Object str;
 994   USE_SAFE_ALLOCA;
 995
 996   SAFE_ALLOCA (buf, unsigned char *, n);
 997   p = buf;
 998
 999   for (i = 0; i < n; i++)
1000     {
1001       CHECK_NATNUM (args[i]);
1002       c = XFASTINT (args[i]);
1003       if (c >= 256)
1004         args_out_of_range_3 (args[i], make_number (0), make_number (255));
1005       *p++ = c;
1006     }
1007
1008   str = make_string_from_bytes ((char *) buf, n, p - buf);
1009   SAFE_FREE ();
1010   return str;
1011 }
1012
1013 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
1014        Schar_resolve_modifiers, 1, 1, 0,
1015        doc: /* Resolve modifiers in the character CHAR.
1016 The value is a character with modifiers resolved into the character
1017 code.  Unresolved modifiers are kept in the value.
1018 usage: (char-resolve-modifiers CHAR)  */)
1019      (character)
1020      Lisp_Object character;
1021 {
1022   int c;
1023
1024   CHECK_NUMBER (character);
1025   c = XINT (character);
1026   return make_number (char_resolve_modifier_mask (c));
1027 }
1028
1029 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
1030        doc: /* Return a byte value of a character at point.
1031 Optional 1st arg POSITION, if non-nil, is a position of a character to get
1032 a byte value.
1033 Optional 2nd arg STRING, if non-nil, is a string of which first
1034 character is a target to get a byte value.  In this case, POSITION, if
1035 non-nil, is an index of a target character in the string.
1036
1037 If the current buffer (or STRING) is multibyte, and the target
1038 character is not ASCII nor 8-bit character, an error is signalled.  */)
1039      (position, string)
1040      Lisp_Object position, string;
1041 {
1042   int c;
1043   EMACS_INT pos;
1044   unsigned char *p;
1045
1046   if (NILP (string))
1047     {
1048       if (NILP (position))
1049         {
1050           p = PT_ADDR;
1051         }
1052       else
1053         {
1054           CHECK_NUMBER_COERCE_MARKER (position);
1055           if (XINT (position) < BEGV || XINT (position) >= ZV)
1056             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1057           pos = XFASTINT (position);
1058           p = CHAR_POS_ADDR (pos);
1059         }
1060       if (NILP (current_buffer->enable_multibyte_characters))
1061         return make_number (*p);
1062     }
1063   else
1064     {
1065       CHECK_STRING (string);
1066       if (NILP (position))
1067         {
1068           p = SDATA (string);
1069         }
1070       else
1071         {
1072           CHECK_NATNUM (position);
1073           if (XINT (position) >= SCHARS (string))
1074             args_out_of_range (string, position);
1075           pos = XFASTINT (position);
1076           p = SDATA (string) + string_char_to_byte (string, pos);
1077         }
1078       if (! STRING_MULTIBYTE (string))
1079         return make_number (*p);
1080     }
1081   c = STRING_CHAR (p);
1082   if (CHAR_BYTE8_P (c))
1083     c = CHAR_TO_BYTE8 (c);
1084   else if (! ASCII_CHAR_P (c))
1085     error ("Not an ASCII nor an 8-bit character: %d", c);
1086   return make_number (c);
1087 }
1088
1089
1090 void
1091 init_character_once ()
1092 {
1093 }
1094
1095 #ifdef emacs
1096
1097 void
1098 syms_of_character ()
1099 {
1100   DEFSYM (Qcharacterp, "characterp");
1101   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1102
1103   staticpro (&Vchar_unify_table);
1104   Vchar_unify_table = Qnil;
1105
1106   defsubr (&Smax_char);
1107   defsubr (&Scharacterp);
1108   defsubr (&Sunibyte_char_to_multibyte);
1109   defsubr (&Smultibyte_char_to_unibyte);
1110   defsubr (&Schar_bytes);
1111   defsubr (&Schar_width);
1112   defsubr (&Sstring_width);
1113   defsubr (&Schar_direction);
1114   defsubr (&Sstring);
1115   defsubr (&Sunibyte_string);
1116   defsubr (&Schar_resolve_modifiers);
1117   defsubr (&Sget_byte);
1118
1119   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
1120                doc: /*
1121 Vector recording all translation tables ever defined.
1122 Each element is a pair (SYMBOL . TABLE) relating the table to the
1123 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1124   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1125
1126   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1127                doc: /*
1128 A char-table for characters which invoke auto-filling.
1129 Such characters have value t in this table.  */);
1130   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1131   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1132   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1133
1134   DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1135                doc: /*
1136 A char-table for width (columns) of each character.  */);
1137   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1138   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1139   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1140                         make_number (4));
1141
1142   DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1143                doc: /* A char-table for direction of each character.  */);
1144   Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1145
1146   DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1147                doc: /* A char-table for each printable character.  */);
1148   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1149   Fset_char_table_range (Vprintable_chars,
1150                          Fcons (make_number (32), make_number (126)), Qt);
1151   Fset_char_table_range (Vprintable_chars,
1152                          Fcons (make_number (160),
1153                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1154
1155   DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1156                doc: /* Char table of script symbols.
1157 It has one extra slot whose value is a list of script symbols.  */);
1158
1159   /* Intern this now in case it isn't already done.
1160      Setting this variable twice is harmless.
1161      But don't staticpro it here--that is done in alloc.c.  */
1162   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
1163   DEFSYM (Qchar_script_table, "char-script-table");
1164   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1165   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1166
1167   DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
1168                doc: /* Alist of scripts vs the representative characters.
1169 Each element is a cons (SCRIPT . CHARS).
1170 SCRIPT is a symbol representing a script or a subgroup of a script.
1171 CHARS is a list or a vector of characters.
1172 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1173 If it is a vector, one of the characters in the vector is necessary.
1174 This variable is used to find a font for a specific script.  */);
1175   Vscript_representative_chars = Qnil;
1176
1177   DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
1178                doc: /* Char table of Unicode's "General Category".
1179 All Unicode characters have one of the following values (symbol):
1180   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1181   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1182 See The Unicode Standard for the meaning of those values.  */);
1183   /* The correct char-table is setup in characters.el.  */
1184   Vunicode_category_table = Qnil;
1185 }
1186
1187 #endif /* emacs */
1188
1189 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1190    (do not change this comment) */