src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2011  Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <setjmp.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
  51 Lisp_Object Qcharacterp;
  52
  53 Lisp_Object Qauto_fill_chars;
  54
  55 /* Char-table of information about which character to unify to which
  56    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  57 Lisp_Object Vchar_unify_table;
  58
  59 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  60 unsigned char *_fetch_multibyte_char_p;
  61
  62 static Lisp_Object Qchar_script_table;
  63
  64 \f
  65
  66 /* If character code C has modifier masks, reflect them to the
  67    character code if possible.  Return the resulting code.  */
  68
  69 int
  70 char_resolve_modifier_mask (int c)
  71 {
  72   /* A non-ASCII character can't reflect modifier bits to the code.  */
  73   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  74     return c;
  75
  76   /* For Meta, Shift, and Control modifiers, we need special care.  */
  77   if (c & CHAR_SHIFT)
  78     {
  79       /* Shift modifier is valid only with [A-Za-z].  */
  80       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  81         c &= ~CHAR_SHIFT;
  82       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  83         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  84       /* Shift modifier for control characters and SPC is ignored.  */
  85       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  86         c &= ~CHAR_SHIFT;
  87     }
  88   if (c & CHAR_CTL)
  89     {
  90       /* Simulate the code in lread.c.  */
  91       /* Allow `\C- ' and `\C-?'.  */
  92       if ((c & 0377) == ' ')
  93         c &= ~0177 & ~ CHAR_CTL;
  94       else if ((c & 0377) == '?')
  95         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  96       /* ASCII control chars are made from letters (both cases),
  97          as well as the non-letters within 0100...0137.  */
  98       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  99         c &= (037 | (~0177 & ~CHAR_CTL));
 100       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 101         c &= (037 | (~0177 & ~CHAR_CTL));
 102     }
 103 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 104   if (c & CHAR_META)
 105     {
 106       /* Move the meta bit to the right place for a string.  */
 107       c = (c & ~CHAR_META) | 0x80;
 108     }
 109 #endif
 110
 111   return c;
 112 }
 113
 114
 115 /* Store multibyte form of character C at P.  If C has modifier bits,
 116    handle them appropriately.  */
 117
 118 int
 119 char_string (unsigned int c, unsigned char *p)
 120 {
 121   int bytes;
 122
 123   if (c & CHAR_MODIFIER_MASK)
 124     {
 125       c = (unsigned) char_resolve_modifier_mask ((int) c);
 126       /* If C still has any modifier bits, just ignore it.  */
 127       c &= ~CHAR_MODIFIER_MASK;
 128     }
 129
 130   MAYBE_UNIFY_CHAR (c);
 131
 132   if (c <= MAX_3_BYTE_CHAR)
 133     {
 134       bytes = CHAR_STRING (c, p);
 135     }
 136   else if (c <= MAX_4_BYTE_CHAR)
 137     {
 138       p[0] = (0xF0 | (c >> 18));
 139       p[1] = (0x80 | ((c >> 12) & 0x3F));
 140       p[2] = (0x80 | ((c >> 6) & 0x3F));
 141       p[3] = (0x80 | (c & 0x3F));
 142       bytes = 4;
 143     }
 144   else if (c <= MAX_5_BYTE_CHAR)
 145     {
 146       p[0] = 0xF8;
 147       p[1] = (0x80 | ((c >> 18) & 0x0F));
 148       p[2] = (0x80 | ((c >> 12) & 0x3F));
 149       p[3] = (0x80 | ((c >> 6) & 0x3F));
 150       p[4] = (0x80 | (c & 0x3F));
 151       bytes = 5;
 152     }
 153   else if (c <= MAX_CHAR)
 154     {
 155       c = CHAR_TO_BYTE8 (c);
 156       bytes = BYTE8_STRING (c, p);
 157     }
 158   else
 159     error ("Invalid character: %d", c);
 160
 161   return bytes;
 162 }
 163
 164
 165 /* Return a character whose multibyte form is at P.  Set LEN is not
 166    NULL, it must be a pointer to integer.  In that case, set *LEN to
 167    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 168    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 169    the ending address (i.e. the starting address of the next
 170    character) of the multibyte form.  */
 171
 172 int
 173 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 174 {
 175   int c;
 176   const unsigned char *saved_p = p;
 177
 178   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 179     {
 180       c = STRING_CHAR_ADVANCE (p);
 181     }
 182   else if (! (*p & 0x08))
 183     {
 184       c = ((((p)[0] & 0xF) << 18)
 185            | (((p)[1] & 0x3F) << 12)
 186            | (((p)[2] & 0x3F) << 6)
 187            | ((p)[3] & 0x3F));
 188       p += 4;
 189     }
 190   else
 191     {
 192       c = ((((p)[1] & 0x3F) << 18)
 193            | (((p)[2] & 0x3F) << 12)
 194            | (((p)[3] & 0x3F) << 6)
 195            | ((p)[4] & 0x3F));
 196       p += 5;
 197     }
 198
 199   MAYBE_UNIFY_CHAR (c);
 200
 201   if (len)
 202     *len = p - saved_p;
 203   if (advanced)
 204     *advanced = p;
 205   return c;
 206 }
 207
 208
 209 /* Translate character C by translation table TABLE.  If C is
 210    negative, translate a character specified by CHARSET and CODE.  If
 211    no translation is found in TABLE, return the untranslated
 212    character.  If TABLE is a list, elements are char tables.  In this
 213    case, translace C by all tables.  */
 214
 215 int
 216 translate_char (Lisp_Object table, int c)
 217 {
 218   if (CHAR_TABLE_P (table))
 219     {
 220       Lisp_Object ch;
 221
 222       ch = CHAR_TABLE_REF (table, c);
 223       if (CHARACTERP (ch))
 224         c = XINT (ch);
 225     }
 226   else
 227     {
 228       for (; CONSP (table); table = XCDR (table))
 229         c = translate_char (XCAR (table), c);
 230     }
 231   return c;
 232 }
 233
 234 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 235    them, return (C & 0xFF).  */
 236
 237 int
 238 multibyte_char_to_unibyte (int c)
 239 {
 240   if (c < 0x80)
 241     return c;
 242   if (CHAR_BYTE8_P (c))
 243     return CHAR_TO_BYTE8 (c);
 244   return (c & 0xFF);
 245 }
 246
 247 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 248    by charset_unibyte.  */
 249
 250 int
 251 multibyte_char_to_unibyte_safe (int c)
 252 {
 253   if (c < 0x80)
 254     return c;
 255   if (CHAR_BYTE8_P (c))
 256     return CHAR_TO_BYTE8 (c);
 257   return -1;
 258 }
 259
 260 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 261        doc: /* Return non-nil if OBJECT is a character.  */)
 262   (Lisp_Object object, Lisp_Object ignore)
 263 {
 264   return (CHARACTERP (object) ? Qt : Qnil);
 265 }
 266
 267 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 268        doc: /* Return the character of the maximum code.  */)
 269   (void)
 270 {
 271   return make_number (MAX_CHAR);
 272 }
 273
 274 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 275        Sunibyte_char_to_multibyte, 1, 1, 0,
 276        doc: /* Convert the byte CH to multibyte character.  */)
 277   (Lisp_Object ch)
 278 {
 279   int c;
 280
 281   CHECK_CHARACTER (ch);
 282   c = XFASTINT (ch);
 283   if (c >= 0x100)
 284     error ("Not a unibyte character: %d", c);
 285   MAKE_CHAR_MULTIBYTE (c);
 286   return make_number (c);
 287 }
 288
 289 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 290        Smultibyte_char_to_unibyte, 1, 1, 0,
 291        doc: /* Convert the multibyte character CH to a byte.
 292 If the multibyte character does not represent a byte, return -1.  */)
 293   (Lisp_Object ch)
 294 {
 295   int cm;
 296
 297   CHECK_CHARACTER (ch);
 298   cm = XFASTINT (ch);
 299   if (cm < 256)
 300     /* Can't distinguish a byte read from a unibyte buffer from
 301        a latin1 char, so let's let it slide.  */
 302     return ch;
 303   else
 304     {
 305       int cu = CHAR_TO_BYTE_SAFE (cm);
 306       return make_number (cu);
 307     }
 308 }
 309
 310 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 311        doc: /* Return width of CHAR when displayed in the current buffer.
 312 The width is measured by how many columns it occupies on the screen.
 313 Tab is taken to occupy `tab-width' columns.
 314 usage: (char-width CHAR)  */)
 315   (Lisp_Object ch)
 316 {
 317   Lisp_Object disp;
 318   int c, width;
 319   struct Lisp_Char_Table *dp = buffer_display_table ();
 320
 321   CHECK_CHARACTER (ch);
 322   c = XINT (ch);
 323
 324   /* Get the way the display table would display it.  */
 325   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 326
 327   if (VECTORP (disp))
 328     width = ASIZE (disp);
 329   else
 330     width = CHAR_WIDTH (c);
 331
 332   return make_number (width);
 333 }
 334
 335 /* Return width of string STR of length LEN when displayed in the
 336    current buffer.  The width is measured by how many columns it
 337    occupies on the screen.  If PRECISION > 0, return the width of
 338    longest substring that doesn't exceed PRECISION, and set number of
 339    characters and bytes of the substring in *NCHARS and *NBYTES
 340    respectively.  */
 341
 342 EMACS_INT
 343 c_string_width (const unsigned char *str, EMACS_INT len, int precision,
 344                 EMACS_INT *nchars, EMACS_INT *nbytes)
 345 {
 346   EMACS_INT i = 0, i_byte = 0;
 347   EMACS_INT width = 0;
 348   struct Lisp_Char_Table *dp = buffer_display_table ();
 349
 350   while (i_byte < len)
 351     {
 352       int bytes, thiswidth;
 353       Lisp_Object val;
 354       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 355
 356       if (dp)
 357         {
 358           val = DISP_CHAR_VECTOR (dp, c);
 359           if (VECTORP (val))
 360             thiswidth = XVECTOR (val)->size;
 361           else
 362             thiswidth = CHAR_WIDTH (c);
 363         }
 364       else
 365         {
 366           thiswidth = CHAR_WIDTH (c);
 367         }
 368
 369       if (precision > 0
 370           && (width + thiswidth > precision))
 371         {
 372           *nchars = i;
 373           *nbytes = i_byte;
 374           return width;
 375         }
 376       i++;
 377       i_byte += bytes;
 378       width += thiswidth;
 379   }
 380
 381   if (precision > 0)
 382     {
 383       *nchars = i;
 384       *nbytes = i_byte;
 385     }
 386
 387   return width;
 388 }
 389
 390 /* Return width of string STR of length LEN when displayed in the
 391    current buffer.  The width is measured by how many columns it
 392    occupies on the screen.  */
 393
 394 EMACS_INT
 395 strwidth (const char *str, EMACS_INT len)
 396 {
 397   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 398 }
 399
 400 /* Return width of Lisp string STRING when displayed in the current
 401    buffer.  The width is measured by how many columns it occupies on
 402    the screen while paying attention to compositions.  If PRECISION >
 403    0, return the width of longest substring that doesn't exceed
 404    PRECISION, and set number of characters and bytes of the substring
 405    in *NCHARS and *NBYTES respectively.  */
 406
 407 EMACS_INT
 408 lisp_string_width (Lisp_Object string, int precision,
 409                    EMACS_INT *nchars, EMACS_INT *nbytes)
 410 {
 411   EMACS_INT len = SCHARS (string);
 412   /* This set multibyte to 0 even if STRING is multibyte when it
 413      contains only ascii and eight-bit-graphic, but that's
 414      intentional.  */
 415   int multibyte = len < SBYTES (string);
 416   unsigned char *str = SDATA (string);
 417   EMACS_INT i = 0, i_byte = 0;
 418   EMACS_INT width = 0;
 419   struct Lisp_Char_Table *dp = buffer_display_table ();
 420
 421   while (i < len)
 422     {
 423       int chars, bytes, thiswidth;
 424       Lisp_Object val;
 425       int cmp_id;
 426       EMACS_INT ignore, end;
 427
 428       if (find_composition (i, -1, &ignore, &end, &val, string)
 429           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 430               >= 0))
 431         {
 432           thiswidth = composition_table[cmp_id]->width;
 433           chars = end - i;
 434           bytes = string_char_to_byte (string, end) - i_byte;
 435         }
 436       else
 437         {
 438           int c;
 439
 440           if (multibyte)
 441             c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 442           else
 443             c = str[i_byte], bytes = 1;
 444           chars = 1;
 445           if (dp)
 446             {
 447               val = DISP_CHAR_VECTOR (dp, c);
 448               if (VECTORP (val))
 449                 thiswidth = XVECTOR (val)->size;
 450               else
 451                 thiswidth = CHAR_WIDTH (c);
 452             }
 453           else
 454             {
 455               thiswidth = CHAR_WIDTH (c);
 456             }
 457         }
 458
 459       if (precision > 0
 460           && (width + thiswidth > precision))
 461         {
 462           *nchars = i;
 463           *nbytes = i_byte;
 464           return width;
 465         }
 466       i += chars;
 467       i_byte += bytes;
 468       width += thiswidth;
 469   }
 470
 471   if (precision > 0)
 472     {
 473       *nchars = i;
 474       *nbytes = i_byte;
 475     }
 476
 477   return width;
 478 }
 479
 480 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 481        doc: /* Return width of STRING when displayed in the current buffer.
 482 Width is measured by how many columns it occupies on the screen.
 483 When calculating width of a multibyte character in STRING,
 484 only the base leading-code is considered; the validity of
 485 the following bytes is not checked.  Tabs in STRING are always
 486 taken to occupy `tab-width' columns.
 487 usage: (string-width STRING)  */)
 488   (Lisp_Object str)
 489 {
 490   Lisp_Object val;
 491
 492   CHECK_STRING (str);
 493   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 494   return val;
 495 }
 496
 497 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 498        doc: /* Return the direction of CHAR.
 499 The returned value is 0 for left-to-right and 1 for right-to-left.
 500 usage: (char-direction CHAR)  */)
 501   (Lisp_Object ch)
 502 {
 503   int c;
 504
 505   CHECK_CHARACTER (ch);
 506   c = XINT (ch);
 507   return CHAR_TABLE_REF (Vchar_direction_table, c);
 508 }
 509
 510 /* Return the number of characters in the NBYTES bytes at PTR.
 511    This works by looking at the contents and checking for multibyte
 512    sequences while assuming that there's no invalid sequence.
 513    However, if the current buffer has enable-multibyte-characters =
 514    nil, we treat each byte as a character.  */
 515
 516 EMACS_INT
 517 chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
 518 {
 519   /* current_buffer is null at early stages of Emacs initialization.  */
 520   if (current_buffer == 0
 521       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 522     return nbytes;
 523
 524   return multibyte_chars_in_text (ptr, nbytes);
 525 }
 526
 527 /* Return the number of characters in the NBYTES bytes at PTR.
 528    This works by looking at the contents and checking for multibyte
 529    sequences while assuming that there's no invalid sequence.  It
 530    ignores enable-multibyte-characters.  */
 531
 532 EMACS_INT
 533 multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
 534 {
 535   const unsigned char *endp = ptr + nbytes;
 536   EMACS_INT chars = 0;
 537
 538   while (ptr < endp)
 539     {
 540       EMACS_INT len = MULTIBYTE_LENGTH (ptr, endp);
 541
 542       if (len == 0)
 543         abort ();
 544       ptr += len;
 545       chars++;
 546     }
 547
 548   return chars;
 549 }
 550
 551 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 552    characters and bytes in it, and store them in *NCHARS and *NBYTES
 553    respectively.  On counting bytes, pay attention to that 8-bit
 554    characters not constructing a valid multibyte sequence are
 555    represented by 2-byte in a multibyte text.  */
 556
 557 void
 558 parse_str_as_multibyte (const unsigned char *str, EMACS_INT len,
 559                         EMACS_INT *nchars, EMACS_INT *nbytes)
 560 {
 561   const unsigned char *endp = str + len;
 562   EMACS_INT n, chars = 0, bytes = 0;
 563
 564   if (len >= MAX_MULTIBYTE_LENGTH)
 565     {
 566       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 567       while (str < adjusted_endp)
 568         {
 569           if (! CHAR_BYTE8_HEAD_P (*str)
 570               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 571             str += n, bytes += n;
 572           else
 573             str++, bytes += 2;
 574           chars++;
 575         }
 576     }
 577   while (str < endp)
 578     {
 579       if (! CHAR_BYTE8_HEAD_P (*str)
 580           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 581         str += n, bytes += n;
 582       else
 583         str++, bytes += 2;
 584       chars++;
 585     }
 586
 587   *nchars = chars;
 588   *nbytes = bytes;
 589   return;
 590 }
 591
 592 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 593    It actually converts only such 8-bit characters that don't contruct
 594    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 595    NCHARS is nonzero, set *NCHARS to the number of characters in the
 596    text.  It is assured that we can use LEN bytes at STR as a work
 597    area and that is enough.  Return the number of bytes of the
 598    resulting text.  */
 599
 600 EMACS_INT
 601 str_as_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT nbytes,
 602                   EMACS_INT *nchars)
 603 {
 604   unsigned char *p = str, *endp = str + nbytes;
 605   unsigned char *to;
 606   EMACS_INT chars = 0;
 607   int n;
 608
 609   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 610     {
 611       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 612       while (p < adjusted_endp
 613              && ! CHAR_BYTE8_HEAD_P (*p)
 614              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 615         p += n, chars++;
 616     }
 617   while (p < endp
 618          && ! CHAR_BYTE8_HEAD_P (*p)
 619          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 620     p += n, chars++;
 621   if (nchars)
 622     *nchars = chars;
 623   if (p == endp)
 624     return nbytes;
 625
 626   to = p;
 627   nbytes = endp - p;
 628   endp = str + len;
 629   memmove (endp - nbytes, p, nbytes);
 630   p = endp - nbytes;
 631
 632   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 633     {
 634       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 635       while (p < adjusted_endp)
 636         {
 637           if (! CHAR_BYTE8_HEAD_P (*p)
 638               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 639             {
 640               while (n--)
 641                 *to++ = *p++;
 642             }
 643           else
 644             {
 645               int c = *p++;
 646               c = BYTE8_TO_CHAR (c);
 647               to += CHAR_STRING (c, to);
 648             }
 649         }
 650       chars++;
 651     }
 652   while (p < endp)
 653     {
 654       if (! CHAR_BYTE8_HEAD_P (*p)
 655           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 656         {
 657           while (n--)
 658             *to++ = *p++;
 659         }
 660       else
 661         {
 662           int c = *p++;
 663           c = BYTE8_TO_CHAR (c);
 664           to += CHAR_STRING (c, to);
 665         }
 666       chars++;
 667     }
 668   if (nchars)
 669     *nchars = chars;
 670   return (to - str);
 671 }
 672
 673 /* Parse unibyte string at STR of LEN bytes, and return the number of
 674    bytes it may ocupy when converted to multibyte string by
 675    `str_to_multibyte'.  */
 676
 677 EMACS_INT
 678 parse_str_to_multibyte (const unsigned char *str, EMACS_INT len)
 679 {
 680   const unsigned char *endp = str + len;
 681   EMACS_INT bytes;
 682
 683   for (bytes = 0; str < endp; str++)
 684     bytes += (*str < 0x80) ? 1 : 2;
 685   return bytes;
 686 }
 687
 688
 689 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 690    that contains the same single-byte characters.  It actually
 691    converts all 8-bit characters to multibyte forms.  It is assured
 692    that we can use LEN bytes at STR as a work area and that is
 693    enough.  */
 694
 695 EMACS_INT
 696 str_to_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT bytes)
 697 {
 698   unsigned char *p = str, *endp = str + bytes;
 699   unsigned char *to;
 700
 701   while (p < endp && *p < 0x80) p++;
 702   if (p == endp)
 703     return bytes;
 704   to = p;
 705   bytes = endp - p;
 706   endp = str + len;
 707   memmove (endp - bytes, p, bytes);
 708   p = endp - bytes;
 709   while (p < endp)
 710     {
 711       int c = *p++;
 712
 713       if (c >= 0x80)
 714         c = BYTE8_TO_CHAR (c);
 715       to += CHAR_STRING (c, to);
 716     }
 717   return (to - str);
 718 }
 719
 720 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 721    actually converts characters in the range 0x80..0xFF to
 722    unibyte.  */
 723
 724 EMACS_INT
 725 str_as_unibyte (unsigned char *str, EMACS_INT bytes)
 726 {
 727   const unsigned char *p = str, *endp = str + bytes;
 728   unsigned char *to;
 729   int c, len;
 730
 731   while (p < endp)
 732     {
 733       c = *p;
 734       len = BYTES_BY_CHAR_HEAD (c);
 735       if (CHAR_BYTE8_HEAD_P (c))
 736         break;
 737       p += len;
 738     }
 739   to = str + (p - str);
 740   while (p < endp)
 741     {
 742       c = *p;
 743       len = BYTES_BY_CHAR_HEAD (c);
 744       if (CHAR_BYTE8_HEAD_P (c))
 745         {
 746           c = STRING_CHAR_ADVANCE (p);
 747           *to++ = CHAR_TO_BYTE8 (c);
 748         }
 749       else
 750         {
 751           while (len--) *to++ = *p++;
 752         }
 753     }
 754   return (to - str);
 755 }
 756
 757 /* Convert eight-bit chars in SRC (in multibyte form) to the
 758    corresponding byte and store in DST.  CHARS is the number of
 759    characters in SRC.  The value is the number of bytes stored in DST.
 760    Usually, the value is the same as CHARS, but is less than it if SRC
 761    contains a non-ASCII, non-eight-bit character.  If ACCEPT_LATIN_1
 762    is nonzero, a Latin-1 character is accepted and converted to a byte
 763    of that character code.
 764    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 765
 766 EMACS_INT
 767 str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
 768 {
 769   EMACS_INT i;
 770
 771   for (i = 0; i < chars; i++)
 772     {
 773       int c = STRING_CHAR_ADVANCE (src);
 774
 775       if (CHAR_BYTE8_P (c))
 776         c = CHAR_TO_BYTE8 (c);
 777       else if (! ASCII_CHAR_P (c)
 778                && (! accept_latin_1 || c >= 0x100))
 779         return i;
 780       *dst++ = c;
 781     }
 782   return i;
 783 }
 784
 785
 786 static EMACS_INT
 787 string_count_byte8 (Lisp_Object string)
 788 {
 789   int multibyte = STRING_MULTIBYTE (string);
 790   EMACS_INT nbytes = SBYTES (string);
 791   unsigned char *p = SDATA (string);
 792   unsigned char *pend = p + nbytes;
 793   EMACS_INT count = 0;
 794   int c, len;
 795
 796   if (multibyte)
 797     while (p < pend)
 798       {
 799         c = *p;
 800         len = BYTES_BY_CHAR_HEAD (c);
 801
 802         if (CHAR_BYTE8_HEAD_P (c))
 803           count++;
 804         p += len;
 805       }
 806   else
 807     while (p < pend)
 808       {
 809         if (*p++ >= 0x80)
 810           count++;
 811       }
 812   return count;
 813 }
 814
 815
 816 Lisp_Object
 817 string_escape_byte8 (Lisp_Object string)
 818 {
 819   EMACS_INT nchars = SCHARS (string);
 820   EMACS_INT nbytes = SBYTES (string);
 821   int multibyte = STRING_MULTIBYTE (string);
 822   EMACS_INT byte8_count;
 823   const unsigned char *src, *src_end;
 824   unsigned char *dst;
 825   Lisp_Object val;
 826   int c, len;
 827
 828   if (multibyte && nchars == nbytes)
 829     return string;
 830
 831   byte8_count = string_count_byte8 (string);
 832
 833   if (byte8_count == 0)
 834     return string;
 835
 836   if (multibyte)
 837     {
 838       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 839           || (MOST_POSITIVE_FIXNUM - nbytes) / 2 < byte8_count)
 840         error ("Maximum string size exceeded");
 841
 842       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 843       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 844                                           nbytes + byte8_count * 2);
 845     }
 846   else
 847     {
 848       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count)
 849         error ("Maximum string size exceeded");
 850       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 851       val = make_uninit_string (nbytes + byte8_count * 3);
 852     }
 853
 854   src = SDATA (string);
 855   src_end = src + nbytes;
 856   dst = SDATA (val);
 857   if (multibyte)
 858     while (src < src_end)
 859       {
 860         c = *src;
 861         len = BYTES_BY_CHAR_HEAD (c);
 862
 863         if (CHAR_BYTE8_HEAD_P (c))
 864           {
 865             c = STRING_CHAR_ADVANCE (src);
 866             c = CHAR_TO_BYTE8 (c);
 867             sprintf ((char *) dst, "\\%03o", c);
 868             dst += 4;
 869           }
 870         else
 871           while (len--) *dst++ = *src++;
 872       }
 873   else
 874     while (src < src_end)
 875       {
 876         c = *src++;
 877         if (c >= 0x80)
 878           {
 879             sprintf ((char *) dst, "\\%03o", c);
 880             dst += 4;
 881           }
 882         else
 883           *dst++ = c;
 884       }
 885   return val;
 886 }
 887
 888 \f
 889 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 890        doc: /*
 891 Concatenate all the argument characters and make the result a string.
 892 usage: (string &rest CHARACTERS)  */)
 893   (size_t n, Lisp_Object *args)
 894 {
 895   size_t i;
 896   int c;
 897   unsigned char *buf, *p;
 898   Lisp_Object str;
 899   USE_SAFE_ALLOCA;
 900
 901   SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
 902   p = buf;
 903
 904   for (i = 0; i < n; i++)
 905     {
 906       CHECK_CHARACTER (args[i]);
 907       c = XINT (args[i]);
 908       p += CHAR_STRING (c, p);
 909     }
 910
 911   str = make_string_from_bytes ((char *) buf, n, p - buf);
 912   SAFE_FREE ();
 913   return str;
 914 }
 915
 916 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 917        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 918 usage: (unibyte-string &rest BYTES)  */)
 919   (size_t n, Lisp_Object *args)
 920 {
 921   size_t i;
 922   int c;
 923   unsigned char *buf, *p;
 924   Lisp_Object str;
 925   USE_SAFE_ALLOCA;
 926
 927   SAFE_ALLOCA (buf, unsigned char *, n);
 928   p = buf;
 929
 930   for (i = 0; i < n; i++)
 931     {
 932       CHECK_NATNUM (args[i]);
 933       c = XFASTINT (args[i]);
 934       if (c >= 256)
 935         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 936       *p++ = c;
 937     }
 938
 939   str = make_string_from_bytes ((char *) buf, n, p - buf);
 940   SAFE_FREE ();
 941   return str;
 942 }
 943
 944 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 945        Schar_resolve_modifiers, 1, 1, 0,
 946        doc: /* Resolve modifiers in the character CHAR.
 947 The value is a character with modifiers resolved into the character
 948 code.  Unresolved modifiers are kept in the value.
 949 usage: (char-resolve-modifiers CHAR)  */)
 950   (Lisp_Object character)
 951 {
 952   int c;
 953
 954   CHECK_NUMBER (character);
 955   c = XINT (character);
 956   return make_number (char_resolve_modifier_mask (c));
 957 }
 958
 959 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 960        doc: /* Return a byte value of a character at point.
 961 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 962 a byte value.
 963 Optional 2nd arg STRING, if non-nil, is a string of which first
 964 character is a target to get a byte value.  In this case, POSITION, if
 965 non-nil, is an index of a target character in the string.
 966
 967 If the current buffer (or STRING) is multibyte, and the target
 968 character is not ASCII nor 8-bit character, an error is signalled.  */)
 969   (Lisp_Object position, Lisp_Object string)
 970 {
 971   int c;
 972   EMACS_INT pos;
 973   unsigned char *p;
 974
 975   if (NILP (string))
 976     {
 977       if (NILP (position))
 978         {
 979           p = PT_ADDR;
 980         }
 981       else
 982         {
 983           CHECK_NUMBER_COERCE_MARKER (position);
 984           if (XINT (position) < BEGV || XINT (position) >= ZV)
 985             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 986           pos = XFASTINT (position);
 987           p = CHAR_POS_ADDR (pos);
 988         }
 989       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 990         return make_number (*p);
 991     }
 992   else
 993     {
 994       CHECK_STRING (string);
 995       if (NILP (position))
 996         {
 997           p = SDATA (string);
 998         }
 999       else
1000         {
1001           CHECK_NATNUM (position);
1002           if (XINT (position) >= SCHARS (string))
1003             args_out_of_range (string, position);
1004           pos = XFASTINT (position);
1005           p = SDATA (string) + string_char_to_byte (string, pos);
1006         }
1007       if (! STRING_MULTIBYTE (string))
1008         return make_number (*p);
1009     }
1010   c = STRING_CHAR (p);
1011   if (CHAR_BYTE8_P (c))
1012     c = CHAR_TO_BYTE8 (c);
1013   else if (! ASCII_CHAR_P (c))
1014     error ("Not an ASCII nor an 8-bit character: %d", c);
1015   return make_number (c);
1016 }
1017
1018
1019 void
1020 init_character_once (void)
1021 {
1022 }
1023
1024 #ifdef emacs
1025
1026 void
1027 syms_of_character (void)
1028 {
1029   DEFSYM (Qcharacterp, "characterp");
1030   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1031
1032   staticpro (&Vchar_unify_table);
1033   Vchar_unify_table = Qnil;
1034
1035   defsubr (&Smax_char);
1036   defsubr (&Scharacterp);
1037   defsubr (&Sunibyte_char_to_multibyte);
1038   defsubr (&Smultibyte_char_to_unibyte);
1039   defsubr (&Schar_width);
1040   defsubr (&Sstring_width);
1041   defsubr (&Schar_direction);
1042   defsubr (&Sstring);
1043   defsubr (&Sunibyte_string);
1044   defsubr (&Schar_resolve_modifiers);
1045   defsubr (&Sget_byte);
1046
1047   DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
1048                doc: /*
1049 Vector recording all translation tables ever defined.
1050 Each element is a pair (SYMBOL . TABLE) relating the table to the
1051 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1052   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1053
1054   DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
1055                doc: /*
1056 A char-table for characters which invoke auto-filling.
1057 Such characters have value t in this table.  */);
1058   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1059   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1060   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1061
1062   DEFVAR_LISP ("char-width-table", Vchar_width_table,
1063                doc: /*
1064 A char-table for width (columns) of each character.  */);
1065   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1066   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1067   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1068                         make_number (4));
1069
1070   DEFVAR_LISP ("char-direction-table", Vchar_direction_table,
1071                doc: /* A char-table for direction of each character.  */);
1072   Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1073
1074   DEFVAR_LISP ("printable-chars", Vprintable_chars,
1075                doc: /* A char-table for each printable character.  */);
1076   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1077   Fset_char_table_range (Vprintable_chars,
1078                          Fcons (make_number (32), make_number (126)), Qt);
1079   Fset_char_table_range (Vprintable_chars,
1080                          Fcons (make_number (160),
1081                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1082
1083   DEFVAR_LISP ("char-script-table", Vchar_script_table,
1084                doc: /* Char table of script symbols.
1085 It has one extra slot whose value is a list of script symbols.  */);
1086
1087   /* Intern this now in case it isn't already done.
1088      Setting this variable twice is harmless.
1089      But don't staticpro it here--that is done in alloc.c.  */
1090   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
1091   DEFSYM (Qchar_script_table, "char-script-table");
1092   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1093   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1094
1095   DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
1096                doc: /* Alist of scripts vs the representative characters.
1097 Each element is a cons (SCRIPT . CHARS).
1098 SCRIPT is a symbol representing a script or a subgroup of a script.
1099 CHARS is a list or a vector of characters.
1100 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1101 If it is a vector, one of the characters in the vector is necessary.
1102 This variable is used to find a font for a specific script.  */);
1103   Vscript_representative_chars = Qnil;
1104
1105   DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
1106                doc: /* Char table of Unicode's "General Category".
1107 All Unicode characters have one of the following values (symbol):
1108   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1109   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1110 See The Unicode Standard for the meaning of those values.  */);
1111   /* The correct char-table is setup in characters.el.  */
1112   Vunicode_category_table = Qnil;
1113 }
1114
1115 #endif /* emacs */