src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   5      Free Software Foundation, Inc.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
/* First, see the documentation in `character.h' to understand the
   code in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include "lisp.h"
  38 #include "character.h"
  39 #include "buffer.h"
  40 #include "charset.h"
  41 #include "composite.h"
  42 #include "disptab.h"
  43
  44 #else  /* not emacs */
  45
  46 #include "mulelib.h"
  47
  48 #endif /* emacs */
  49
/* Symbol `characterp'; set up by DEFSYM in syms_of_character.  */
Lisp_Object Qcharacterp;

/* Vector of all translation tables ever defined.
   The ID of a translation table is used to index this vector.  */
Lisp_Object Vtranslation_table_vector;

/* A char-table for characters which may invoke auto-filling.  */
Lisp_Object Vauto_fill_chars;

/* Symbol `auto-fill-chars'; set up by DEFSYM in syms_of_character and
   used there as the purpose of Vauto_fill_chars.  */
Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  */
Lisp_Object Vchar_unify_table;

/* A char-table.  An element is non-nil iff the corresponding
   character has a printable glyph.  */
Lisp_Object Vprintable_chars;

/* A char-table.  An element is the column width of the corresponding
   character.  */
Lisp_Object Vchar_width_table;

/* A char-table.  An element describes the direction property of the
   corresponding character; Fchar_direction returns it unchanged.  */
Lisp_Object Vchar_direction_table;

/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
unsigned char *_fetch_multibyte_char_p;

/* Char table of scripts.  */
Lisp_Object Vchar_script_table;

/* Alist of scripts vs representative characters.  */
Lisp_Object Vscript_representative_chars;

/* Symbol `char-script-table'; set up by DEFSYM in syms_of_character
   and used there as the purpose of Vchar_script_table.  */
static Lisp_Object Qchar_script_table;

/* Mapping table from unibyte chars to multibyte chars.  */
int unibyte_to_multibyte_table[256];

/* Nth element is 1 iff unibyte char N can be mapped to a multibyte
   char.  */
char unibyte_has_multibyte_table[256];
  94
  95 \f
  96
  97 /* If character code C has modifier masks, reflect them to the
  98    character code if possible.  Return the resulting code.  */
  99
 100 int
 101 char_resolve_modifier_mask (c)
 102      int c;
 103 {
 104   /* A non-ASCII character can't reflect modifier bits to the code.  */
 105   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 106     return c;
 107
 108   /* For Meta, Shift, and Control modifiers, we need special care.  */
 109   if (c & CHAR_SHIFT)
 110     {
 111       /* Shift modifier is valid only with [A-Za-z].  */
 112       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 113         c &= ~CHAR_SHIFT;
 114       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 115         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 116       /* Shift modifier for control characters and SPC is ignored.  */
 117       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
 118         c &= ~CHAR_SHIFT;
 119     }
 120   if (c & CHAR_CTL)
 121     {
 122       /* Simulate the code in lread.c.  */
 123       /* Allow `\C- ' and `\C-?'.  */
 124       if ((c & 0377) == ' ')
 125         c &= ~0177 & ~ CHAR_CTL;
 126       else if ((c & 0377) == '?')
 127         c = 0177 | (c & ~0177 & ~CHAR_CTL);
 128       /* ASCII control chars are made from letters (both cases),
 129          as well as the non-letters within 0100...0137.  */
 130       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 131         c &= (037 | (~0177 & ~CHAR_CTL));
 132       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 133         c &= (037 | (~0177 & ~CHAR_CTL));
 134     }
 135   if (c & CHAR_META)
 136     {
 137       /* Move the meta bit to the right place for a string.  */
 138       c = (c & ~CHAR_META) | 0x80;
 139     }
 140
 141   return c;
 142 }
 143
 144
 145 /* Store multibyte form of character C at P.  If C has modifier bits,
 146    handle them appropriately.  */
 147
 148 int
 149 char_string (c, p)
 150      unsigned c;
 151      unsigned char *p;
 152 {
 153   int bytes;
 154
 155   if (c & CHAR_MODIFIER_MASK)
 156     {
 157       c = (unsigned) char_resolve_modifier_mask ((int) c);
 158       /* If C still has any modifier bits, just ignore it.  */
 159       c &= ~CHAR_MODIFIER_MASK;
 160     }
 161
 162   MAYBE_UNIFY_CHAR (c);
 163
 164   if (c <= MAX_3_BYTE_CHAR)
 165     {
 166       bytes = CHAR_STRING (c, p);
 167     }
 168   else if (c <= MAX_4_BYTE_CHAR)
 169     {
 170       p[0] = (0xF0 | (c >> 18));
 171       p[1] = (0x80 | ((c >> 12) & 0x3F));
 172       p[2] = (0x80 | ((c >> 6) & 0x3F));
 173       p[3] = (0x80 | (c & 0x3F));
 174       bytes = 4;
 175     }
 176   else if (c <= MAX_5_BYTE_CHAR)
 177     {
 178       p[0] = 0xF8;
 179       p[1] = (0x80 | ((c >> 18) & 0x0F));
 180       p[2] = (0x80 | ((c >> 12) & 0x3F));
 181       p[3] = (0x80 | ((c >> 6) & 0x3F));
 182       p[4] = (0x80 | (c & 0x3F));
 183       bytes = 5;
 184     }
 185   else if (c <= MAX_CHAR)
 186     {
 187       c = CHAR_TO_BYTE8 (c);
 188       bytes = BYTE8_STRING (c, p);
 189     }
 190   else
 191     error ("Invalid character: %d", c);
 192
 193   return bytes;
 194 }
 195
 196
 197 /* Return a character whose multibyte form is at P.  Set LEN is not
 198    NULL, it must be a pointer to integer.  In that case, set *LEN to
 199    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 200    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 201    the ending address (i.e. the starting address of the next
 202    character) of the multibyte form.  */
 203
 204 int
 205 string_char (p, advanced, len)
 206      const unsigned char *p;
 207      const unsigned char **advanced;
 208      int *len;
 209 {
 210   int c;
 211   const unsigned char *saved_p = p;
 212
 213   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 214     {
 215       c = STRING_CHAR_ADVANCE (p);
 216     }
 217   else if (! (*p & 0x08))
 218     {
 219       c = ((((p)[0] & 0xF) << 18)
 220            | (((p)[1] & 0x3F) << 12)
 221            | (((p)[2] & 0x3F) << 6)
 222            | ((p)[3] & 0x3F));
 223       p += 4;
 224     }
 225   else
 226     {
 227       c = ((((p)[1] & 0x3F) << 18)
 228            | (((p)[2] & 0x3F) << 12)
 229            | (((p)[3] & 0x3F) << 6)
 230            | ((p)[4] & 0x3F));
 231       p += 5;
 232     }
 233
 234   MAYBE_UNIFY_CHAR (c);
 235
 236   if (len)
 237     *len = p - saved_p;
 238   if (advanced)
 239     *advanced = p;
 240   return c;
 241 }
 242
 243
 244 /* Translate character C by translation table TABLE.  If C is
 245    negative, translate a character specified by CHARSET and CODE.  If
 246    no translation is found in TABLE, return the untranslated
 247    character.  If TABLE is a list, elements are char tables.  In this
 248    case, translace C by all tables.  */
 249
 250 int
 251 translate_char (table, c)
 252      Lisp_Object table;
 253      int c;
 254 {
 255   if (CHAR_TABLE_P (table))
 256     {
 257       Lisp_Object ch;
 258
 259       ch = CHAR_TABLE_REF (table, c);
 260       if (CHARACTERP (ch))
 261         c = XINT (ch);
 262     }
 263   else
 264     {
 265       for (; CONSP (table); table = XCDR (table))
 266         c = translate_char (XCAR (table), c);
 267     }
 268   return c;
 269 }
 270
 271 /* Convert the multibyte character C to unibyte 8-bit character based
 272    on the current value of charset_unibyte.  If dimension of
 273    charset_unibyte is more than one, return (C & 0xFF).
 274
 275    The argument REV_TBL is now ignored.  It will be removed in the
 276    future.  */
 277
 278 int
 279 multibyte_char_to_unibyte (c, rev_tbl)
 280      int c;
 281      Lisp_Object rev_tbl;
 282 {
 283   struct charset *charset;
 284   unsigned c1;
 285
 286   if (CHAR_BYTE8_P (c))
 287     return CHAR_TO_BYTE8 (c);
 288   charset = CHARSET_FROM_ID (charset_unibyte);
 289   c1 = ENCODE_CHAR (charset, c);
 290   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
 291 }
 292
 293 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 294    by charset_unibyte.  */
 295
 296 int
 297 multibyte_char_to_unibyte_safe (c)
 298      int c;
 299 {
 300   struct charset *charset;
 301   unsigned c1;
 302
 303   if (CHAR_BYTE8_P (c))
 304     return CHAR_TO_BYTE8 (c);
 305   charset = CHARSET_FROM_ID (charset_unibyte);
 306   c1 = ENCODE_CHAR (charset, c);
 307   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
 308 }
 309
 310 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 311        doc: /* Return non-nil if OBJECT is a character.  */)
 312      (object, ignore)
 313      Lisp_Object object, ignore;
 314 {
 315   return (CHARACTERP (object) ? Qt : Qnil);
 316 }
 317
 318 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 319        doc: /* Return the character of the maximum code.  */)
 320      ()
 321 {
 322   return make_number (MAX_CHAR);
 323 }
 324
 325 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 326        Sunibyte_char_to_multibyte, 1, 1, 0,
 327        doc: /* Convert the byte CH to multibyte character.  */)
 328      (ch)
 329      Lisp_Object ch;
 330 {
 331   int c;
 332   struct charset *charset;
 333
 334   CHECK_CHARACTER (ch);
 335   c = XFASTINT (ch);
 336   if (c >= 0400)
 337     error ("Invalid unibyte character: %d", c);
 338   charset = CHARSET_FROM_ID (charset_unibyte);
 339   c = DECODE_CHAR (charset, c);
 340   if (c < 0)
 341     c = BYTE8_TO_CHAR (XFASTINT (ch));
 342   return make_number (c);
 343 }
 344
 345 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 346        Smultibyte_char_to_unibyte, 1, 1, 0,
 347        doc: /* Convert the multibyte character CH to a byte.
 348 If the multibyte character does not represent a byte, return -1.  */)
 349      (ch)
 350      Lisp_Object ch;
 351 {
 352   int cm;
 353
 354   CHECK_CHARACTER (ch);
 355   cm = XFASTINT (ch);
 356   if (cm < 256)
 357     /* Can't distinguish a byte read from a unibyte buffer from
 358        a latin1 char, so let's let it slide.  */
 359     return ch;
 360   else
 361     {
 362       int cu = CHAR_TO_BYTE_SAFE (cm);
 363       return make_number (cu);
 364     }
 365 }
 366
 367 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 368        doc: /* Return 1 regardless of the argument CHAR.
 369 This is now an obsolete function.  We keep it just for backward compatibility.
 370 usage: (char-bytes CHAR)  */)
 371      (ch)
 372      Lisp_Object ch;
 373 {
 374   CHECK_CHARACTER (ch);
 375   return make_number (1);
 376 }
 377
 378 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 379        doc: /* Return width of CHAR when displayed in the current buffer.
 380 The width is measured by how many columns it occupies on the screen.
 381 Tab is taken to occupy `tab-width' columns.
 382 usage: (char-width CHAR)  */)
 383      (ch)
 384        Lisp_Object ch;
 385 {
 386   Lisp_Object disp;
 387   int c, width;
 388   struct Lisp_Char_Table *dp = buffer_display_table ();
 389
 390   CHECK_CHARACTER (ch);
 391   c = XINT (ch);
 392
 393   /* Get the way the display table would display it.  */
 394   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 395
 396   if (VECTORP (disp))
 397     width = ASIZE (disp);
 398   else
 399     width = CHAR_WIDTH (c);
 400
 401   return make_number (width);
 402 }
 403
 404 /* Return width of string STR of length LEN when displayed in the
 405    current buffer.  The width is measured by how many columns it
 406    occupies on the screen.  If PRECISION > 0, return the width of
 407    longest substring that doesn't exceed PRECISION, and set number of
 408    characters and bytes of the substring in *NCHARS and *NBYTES
 409    respectively.  */
 410
 411 int
 412 c_string_width (str, len, precision, nchars, nbytes)
 413      const unsigned char *str;
 414      int precision, *nchars, *nbytes;
 415 {
 416   int i = 0, i_byte = 0;
 417   int width = 0;
 418   struct Lisp_Char_Table *dp = buffer_display_table ();
 419
 420   while (i_byte < len)
 421     {
 422       int bytes, thiswidth;
 423       Lisp_Object val;
 424       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 425
 426       if (dp)
 427         {
 428           val = DISP_CHAR_VECTOR (dp, c);
 429           if (VECTORP (val))
 430             thiswidth = XVECTOR (val)->size;
 431           else
 432             thiswidth = CHAR_WIDTH (c);
 433         }
 434       else
 435         {
 436           thiswidth = CHAR_WIDTH (c);
 437         }
 438
 439       if (precision > 0
 440           && (width + thiswidth > precision))
 441         {
 442           *nchars = i;
 443           *nbytes = i_byte;
 444           return width;
 445         }
 446       i++;
 447       i_byte += bytes;
 448       width += thiswidth;
 449   }
 450
 451   if (precision > 0)
 452     {
 453       *nchars = i;
 454       *nbytes = i_byte;
 455     }
 456
 457   return width;
 458 }
 459
 460 /* Return width of string STR of length LEN when displayed in the
 461    current buffer.  The width is measured by how many columns it
 462    occupies on the screen.  */
 463
 464 int
 465 strwidth (str, len)
 466      unsigned char *str;
 467      int len;
 468 {
 469   return c_string_width (str, len, -1, NULL, NULL);
 470 }
 471
 472 /* Return width of Lisp string STRING when displayed in the current
 473    buffer.  The width is measured by how many columns it occupies on
 474    the screen while paying attention to compositions.  If PRECISION >
 475    0, return the width of longest substring that doesn't exceed
 476    PRECISION, and set number of characters and bytes of the substring
 477    in *NCHARS and *NBYTES respectively.  */
 478
 479 int
 480 lisp_string_width (string, precision, nchars, nbytes)
 481      Lisp_Object string;
 482      int precision, *nchars, *nbytes;
 483 {
 484   int len = SCHARS (string);
 485   /* This set multibyte to 0 even if STRING is multibyte when it
 486      contains only ascii and eight-bit-graphic, but that's
 487      intentional.  */
 488   int multibyte = len < SBYTES (string);
 489   unsigned char *str = SDATA (string);
 490   int i = 0, i_byte = 0;
 491   int width = 0;
 492   struct Lisp_Char_Table *dp = buffer_display_table ();
 493
 494   while (i < len)
 495     {
 496       int chars, bytes, thiswidth;
 497       Lisp_Object val;
 498       int cmp_id;
 499       EMACS_INT ignore, end;
 500
 501       if (find_composition (i, -1, &ignore, &end, &val, string)
 502           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 503               >= 0))
 504         {
 505           thiswidth = composition_table[cmp_id]->width;
 506           chars = end - i;
 507           bytes = string_char_to_byte (string, end) - i_byte;
 508         }
 509       else
 510         {
 511           int c;
 512
 513           if (multibyte)
 514             c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 515           else
 516             c = str[i_byte], bytes = 1;
 517           chars = 1;
 518           if (dp)
 519             {
 520               val = DISP_CHAR_VECTOR (dp, c);
 521               if (VECTORP (val))
 522                 thiswidth = XVECTOR (val)->size;
 523               else
 524                 thiswidth = CHAR_WIDTH (c);
 525             }
 526           else
 527             {
 528               thiswidth = CHAR_WIDTH (c);
 529             }
 530         }
 531
 532       if (precision > 0
 533           && (width + thiswidth > precision))
 534         {
 535           *nchars = i;
 536           *nbytes = i_byte;
 537           return width;
 538         }
 539       i += chars;
 540       i_byte += bytes;
 541       width += thiswidth;
 542   }
 543
 544   if (precision > 0)
 545     {
 546       *nchars = i;
 547       *nbytes = i_byte;
 548     }
 549
 550   return width;
 551 }
 552
 553 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 554        doc: /* Return width of STRING when displayed in the current buffer.
 555 Width is measured by how many columns it occupies on the screen.
 556 When calculating width of a multibyte character in STRING,
 557 only the base leading-code is considered; the validity of
 558 the following bytes is not checked.  Tabs in STRING are always
 559 taken to occupy `tab-width' columns.
 560 usage: (string-width STRING)  */)
 561      (str)
 562      Lisp_Object str;
 563 {
 564   Lisp_Object val;
 565
 566   CHECK_STRING (str);
 567   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 568   return val;
 569 }
 570
 571 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 572        doc: /* Return the direction of CHAR.
 573 The returned value is 0 for left-to-right and 1 for right-to-left.
 574 usage: (char-direction CHAR)  */)
 575      (ch)
 576      Lisp_Object ch;
 577 {
 578   int c;
 579
 580   CHECK_CHARACTER (ch);
 581   c = XINT (ch);
 582   return CHAR_TABLE_REF (Vchar_direction_table, c);
 583 }
 584
 585 /* Return the number of characters in the NBYTES bytes at PTR.
 586    This works by looking at the contents and checking for multibyte
 587    sequences while assuming that there's no invalid sequence.
 588    However, if the current buffer has enable-multibyte-characters =
 589    nil, we treat each byte as a character.  */
 590
 591 EMACS_INT
 592 chars_in_text (ptr, nbytes)
 593      const unsigned char *ptr;
 594      EMACS_INT nbytes;
 595 {
 596   /* current_buffer is null at early stages of Emacs initialization.  */
 597   if (current_buffer == 0
 598       || NILP (current_buffer->enable_multibyte_characters))
 599     return nbytes;
 600
 601   return multibyte_chars_in_text (ptr, nbytes);
 602 }
 603
 604 /* Return the number of characters in the NBYTES bytes at PTR.
 605    This works by looking at the contents and checking for multibyte
 606    sequences while assuming that there's no invalid sequence.  It
 607    ignores enable-multibyte-characters.  */
 608
 609 EMACS_INT
 610 multibyte_chars_in_text (ptr, nbytes)
 611      const unsigned char *ptr;
 612      EMACS_INT nbytes;
 613 {
 614   const unsigned char *endp = ptr + nbytes;
 615   int chars = 0;
 616
 617   while (ptr < endp)
 618     {
 619       int len = MULTIBYTE_LENGTH (ptr, endp);
 620
 621       if (len == 0)
 622         abort ();
 623       ptr += len;
 624       chars++;
 625     }
 626
 627   return chars;
 628 }
 629
 630 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 631    characters and bytes in it, and store them in *NCHARS and *NBYTES
 632    respectively.  On counting bytes, pay attention to that 8-bit
 633    characters not constructing a valid multibyte sequence are
 634    represented by 2-byte in a multibyte text.  */
 635
 636 void
 637 parse_str_as_multibyte (str, len, nchars, nbytes)
 638      const unsigned char *str;
 639      int len, *nchars, *nbytes;
 640 {
 641   const unsigned char *endp = str + len;
 642   int n, chars = 0, bytes = 0;
 643
 644   if (len >= MAX_MULTIBYTE_LENGTH)
 645     {
 646       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 647       while (str < adjusted_endp)
 648         {
 649           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 650             str += n, bytes += n;
 651           else
 652             str++, bytes += 2;
 653           chars++;
 654         }
 655     }
 656   while (str < endp)
 657     {
 658       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 659         str += n, bytes += n;
 660       else
 661         str++, bytes += 2;
 662       chars++;
 663     }
 664
 665   *nchars = chars;
 666   *nbytes = bytes;
 667   return;
 668 }
 669
 670 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 671    It actually converts only such 8-bit characters that don't contruct
 672    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 673    NCHARS is nonzero, set *NCHARS to the number of characters in the
 674    text.  It is assured that we can use LEN bytes at STR as a work
 675    area and that is enough.  Return the number of bytes of the
 676    resulting text.  */
 677
 678 int
 679 str_as_multibyte (str, len, nbytes, nchars)
 680      unsigned char *str;
 681      int len, nbytes, *nchars;
 682 {
 683   unsigned char *p = str, *endp = str + nbytes;
 684   unsigned char *to;
 685   int chars = 0;
 686   int n;
 687
 688   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 689     {
 690       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 691       while (p < adjusted_endp
 692              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 693         p += n, chars++;
 694     }
 695   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 696     p += n, chars++;
 697   if (nchars)
 698     *nchars = chars;
 699   if (p == endp)
 700     return nbytes;
 701
 702   to = p;
 703   nbytes = endp - p;
 704   endp = str + len;
 705   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 706   p = endp - nbytes;
 707
 708   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 709     {
 710       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 711       while (p < adjusted_endp)
 712         {
 713           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 714             {
 715               while (n--)
 716                 *to++ = *p++;
 717             }
 718           else
 719             {
 720               int c = *p++;
 721               c = BYTE8_TO_CHAR (c);
 722               to += CHAR_STRING (c, to);
 723             }
 724         }
 725       chars++;
 726     }
 727   while (p < endp)
 728     {
 729       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 730         {
 731           while (n--)
 732             *to++ = *p++;
 733         }
 734       else
 735         {
 736           int c = *p++;
 737           c = BYTE8_TO_CHAR (c);
 738           to += CHAR_STRING (c, to);
 739         }
 740       chars++;
 741     }
 742   if (nchars)
 743     *nchars = chars;
 744   return (to - str);
 745 }
 746
 747 /* Parse unibyte string at STR of LEN bytes, and return the number of
 748    bytes it may ocupy when converted to multibyte string by
 749    `str_to_multibyte'.  */
 750
 751 int
 752 parse_str_to_multibyte (str, len)
 753      unsigned char *str;
 754      int len;
 755 {
 756   unsigned char *endp = str + len;
 757   int bytes;
 758
 759   for (bytes = 0; str < endp; str++)
 760     bytes += (*str < 0x80) ? 1 : 2;
 761   return bytes;
 762 }
 763
 764
 765 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 766    that contains the same single-byte characters.  It actually
 767    converts all 8-bit characters to multibyte forms.  It is assured
 768    that we can use LEN bytes at STR as a work area and that is
 769    enough.  */
 770
 771 int
 772 str_to_multibyte (str, len, bytes)
 773      unsigned char *str;
 774      int len, bytes;
 775 {
 776   unsigned char *p = str, *endp = str + bytes;
 777   unsigned char *to;
 778
 779   while (p < endp && *p < 0x80) p++;
 780   if (p == endp)
 781     return bytes;
 782   to = p;
 783   bytes = endp - p;
 784   endp = str + len;
 785   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 786   p = endp - bytes;
 787   while (p < endp)
 788     {
 789       int c = *p++;
 790
 791       if (c >= 0x80)
 792         c = BYTE8_TO_CHAR (c);
 793       to += CHAR_STRING (c, to);
 794     }
 795   return (to - str);
 796 }
 797
 798 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 799    actually converts characters in the range 0x80..0xFF to
 800    unibyte.  */
 801
 802 int
 803 str_as_unibyte (str, bytes)
 804      unsigned char *str;
 805      int bytes;
 806 {
 807   const unsigned char *p = str, *endp = str + bytes;
 808   unsigned char *to;
 809   int c, len;
 810
 811   while (p < endp)
 812     {
 813       c = *p;
 814       len = BYTES_BY_CHAR_HEAD (c);
 815       if (CHAR_BYTE8_HEAD_P (c))
 816         break;
 817       p += len;
 818     }
 819   to = str + (p - str);
 820   while (p < endp)
 821     {
 822       c = *p;
 823       len = BYTES_BY_CHAR_HEAD (c);
 824       if (CHAR_BYTE8_HEAD_P (c))
 825         {
 826           c = STRING_CHAR_ADVANCE (p);
 827           *to++ = CHAR_TO_BYTE8 (c);
 828         }
 829       else
 830         {
 831           while (len--) *to++ = *p++;
 832         }
 833     }
 834   return (to - str);
 835 }
 836
 837 /* Convert eight-bit chars in SRC (in multibyte form) to the
 838    corresponding byte and store in DST.  CHARS is the number of
 839    characters in SRC.  The value is the number of bytes stored in DST.
 840    Usually, the value is the same as CHARS, but is less than it if SRC
 841    contains a non-ASCII, non-eight-bit characater.  If ACCEPT_LATIN_1
 842    is nonzero, a Latin-1 character is accepted and converted to a byte
 843    of that character code. */
 844
 845 EMACS_INT
 846 str_to_unibyte (src, dst, chars, accept_latin_1)
 847      const unsigned char *src;
 848      unsigned char *dst;
 849      EMACS_INT chars;
 850      int accept_latin_1;
 851 {
 852   EMACS_INT i;
 853
 854   for (i = 0; i < chars; i++)
 855     {
 856       int c = STRING_CHAR_ADVANCE (src);
 857
 858       if (CHAR_BYTE8_P (c))
 859         c = CHAR_TO_BYTE8 (c);
 860       else if (! ASCII_CHAR_P (c)
 861                && (! accept_latin_1 || c >= 0x100))
 862         return i;
 863       *dst++ = c;
 864     }
 865   return i;
 866 }
 867
 868
 869 int
 870 string_count_byte8 (string)
 871      Lisp_Object string;
 872 {
 873   int multibyte = STRING_MULTIBYTE (string);
 874   int nbytes = SBYTES (string);
 875   unsigned char *p = SDATA (string);
 876   unsigned char *pend = p + nbytes;
 877   int count = 0;
 878   int c, len;
 879
 880   if (multibyte)
 881     while (p < pend)
 882       {
 883         c = *p;
 884         len = BYTES_BY_CHAR_HEAD (c);
 885
 886         if (CHAR_BYTE8_HEAD_P (c))
 887           count++;
 888         p += len;
 889       }
 890   else
 891     while (p < pend)
 892       {
 893         if (*p++ >= 0x80)
 894           count++;
 895       }
 896   return count;
 897 }
 898
 899
 900 Lisp_Object
 901 string_escape_byte8 (string)
 902      Lisp_Object string;
 903 {
 904   int nchars = SCHARS (string);
 905   int nbytes = SBYTES (string);
 906   int multibyte = STRING_MULTIBYTE (string);
 907   int byte8_count;
 908   const unsigned char *src, *src_end;
 909   unsigned char *dst;
 910   Lisp_Object val;
 911   int c, len;
 912
 913   if (multibyte && nchars == nbytes)
 914     return string;
 915
 916   byte8_count = string_count_byte8 (string);
 917
 918   if (byte8_count == 0)
 919     return string;
 920
 921   if (multibyte)
 922     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 923     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 924                                         nbytes + byte8_count * 2);
 925   else
 926     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 927     val = make_uninit_string (nbytes + byte8_count * 3);
 928
 929   src = SDATA (string);
 930   src_end = src + nbytes;
 931   dst = SDATA (val);
 932   if (multibyte)
 933     while (src < src_end)
 934       {
 935         c = *src;
 936         len = BYTES_BY_CHAR_HEAD (c);
 937
 938         if (CHAR_BYTE8_HEAD_P (c))
 939           {
 940             c = STRING_CHAR_ADVANCE (src);
 941             c = CHAR_TO_BYTE8 (c);
 942             sprintf ((char *) dst, "\\%03o", c);
 943             dst += 4;
 944           }
 945         else
 946           while (len--) *dst++ = *src++;
 947       }
 948   else
 949     while (src < src_end)
 950       {
 951         c = *src++;
 952         if (c >= 0x80)
 953           {
 954             sprintf ((char *) dst, "\\%03o", c);
 955             dst += 4;
 956           }
 957         else
 958           *dst++ = c;
 959       }
 960   return val;
 961 }
 962
 963 \f
 964 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 965        doc: /*
 966 Concatenate all the argument characters and make the result a string.
 967 usage: (string &rest CHARACTERS)  */)
 968      (n, args)
 969      int n;
 970      Lisp_Object *args;
 971 {
 972   int i;
 973   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 974   unsigned char *p = buf;
 975   int c;
 976
 977   for (i = 0; i < n; i++)
 978     {
 979       CHECK_CHARACTER (args[i]);
 980       c = XINT (args[i]);
 981       p += CHAR_STRING (c, p);
 982     }
 983
 984   return make_string_from_bytes ((char *) buf, n, p - buf);
 985 }
 986
 987 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 988        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 989 usage: (unibyte-string &rest BYTES)  */)
 990      (n, args)
 991      int n;
 992      Lisp_Object *args;
 993 {
 994   int i;
 995   unsigned char *buf = (unsigned char *) alloca (n);
 996   unsigned char *p = buf;
 997   unsigned c;
 998
 999   for (i = 0; i < n; i++)
1000     {
1001       CHECK_NATNUM (args[i]);
1002       c = XFASTINT (args[i]);
1003       if (c >= 256)
1004         args_out_of_range_3 (args[i], make_number (0), make_number (255));
1005       *p++ = c;
1006     }
1007
1008   return make_string_from_bytes ((char *) buf, n, p - buf);
1009 }
1010
1011 DEFUN ("char-resolve-modifers", Fchar_resolve_modifiers,
1012        Schar_resolve_modifiers, 1, 1, 0,
1013        doc: /* Resolve modifiers in the character CHAR.
1014 The value is a character with modifiers resolved into the character
1015 code.  Unresolved modifiers are kept in the value.
1016 usage: (char-resolve-modifers CHAR)  */)
1017      (character)
1018      Lisp_Object character;
1019 {
1020   int c;
1021
1022   CHECK_NUMBER (character);
1023   c = XINT (character);
1024   return make_number (char_resolve_modifier_mask (c));
1025 }
1026
/* One-time initialization hook for this file.  It currently has
   nothing to do.  */
void
init_character_once ()
{
}
1031
1032 #ifdef emacs
1033
/* Set up the symbols, primitives and Lisp variables defined in this
   file, and give the variables their initial values.  */
void
syms_of_character ()
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is not a DEFVAR_LISP variable, so it is
     protected explicitly.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register the primitives defined above.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_bytes);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Schar_direction);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);

  DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with room for 16 tables, all slots nil.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only space and newline invoke auto-filling.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", &Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* The default width is 1; the codes 0x80..0x9F and those above
     MAX_5_BYTE_CHAR get width 4.  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
               doc: /* A char-table for direction of each character.  */);
  /* Every character starts out with the value 1.  */
  Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));

  DEFVAR_LISP ("printable-chars", &Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* Mark 32..126 and 160..MAX_5_BYTE_CHAR as printable; everything
     else stays nil.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", &Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* Declare the one extra slot before the table itself is made.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.  */);
  Vscript_representative_chars = Qnil;
}
1107
1108 #endif /* emacs */
1109
1110 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1111    (do not change this comment) */