src/charset.c

   1 /* Basic multilingual character support.
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2004 Free Software Foundation, Inc.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
/* To understand the code in this file, first read the documentation
   in `charset.h'.  */
  25
  26 #ifdef emacs
  27 #include <config.h>
  28 #endif
  29
  30 #include <stdio.h>
  31
  32 #ifdef emacs
  33
  34 #include <sys/types.h>
  35 #include "lisp.h"
  36 #include "buffer.h"
  37 #include "charset.h"
  38 #include "composite.h"
  39 #include "coding.h"
  40 #include "disptab.h"
  41
  42 #else  /* not emacs */
  43
  44 #include "mulelib.h"
  45
  46 #endif /* emacs */
  47
  48 Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
  49 Lisp_Object Qunknown;
  50
  51 /* Declaration of special leading-codes.  */
  52 EMACS_INT leading_code_private_11; /* for private DIMENSION1 of 1-column */
  53 EMACS_INT leading_code_private_12; /* for private DIMENSION1 of 2-column */
  54 EMACS_INT leading_code_private_21; /* for private DIMENSION2 of 1-column */
  55 EMACS_INT leading_code_private_22; /* for private DIMENSION2 of 2-column */
  56
  57 /* Declaration of special charsets.  The values are set by
  58    Fsetup_special_charsets.  */
  59 int charset_latin_iso8859_1;    /* ISO8859-1 (Latin-1) */
  60 int charset_jisx0208_1978;      /* JISX0208.1978 (Japanese Kanji old set) */
  61 int charset_jisx0208;           /* JISX0208.1983 (Japanese Kanji) */
  62 int charset_katakana_jisx0201;  /* JISX0201.Kana (Japanese Katakana) */
  63 int charset_latin_jisx0201;     /* JISX0201.Roman (Japanese Roman) */
  64 int charset_big5_1;             /* Big5 Level 1 (Chinese Traditional) */
  65 int charset_big5_2;             /* Big5 Level 2 (Chinese Traditional) */
  66
  67 Lisp_Object Qcharset_table;
  68
  69 /* A char-table containing information of each character set.  */
  70 Lisp_Object Vcharset_table;
  71
  72 /* A vector of charset symbol indexed by charset-id.  This is used
  73    only for returning charset symbol from C functions.  */
  74 Lisp_Object Vcharset_symbol_table;
  75
  76 /* A list of charset symbols ever defined.  */
  77 Lisp_Object Vcharset_list;
  78
  79 /* Vector of translation table ever defined.
  80    ID of a translation table is used to index this vector.  */
  81 Lisp_Object Vtranslation_table_vector;
  82
  83 /* A char-table for characters which may invoke auto-filling.  */
  84 Lisp_Object Vauto_fill_chars;
  85
  86 Lisp_Object Qauto_fill_chars;
  87
  88 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD.  */
  89 int bytes_by_char_head[256];
  90 int width_by_char_head[256];
  91
  92 /* Mapping table from ISO2022's charset (specified by DIMENSION,
  93    CHARS, and FINAL-CHAR) to Emacs' charset.  */
  94 int iso_charset_table[2][2][128];
  95
  96 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  97 unsigned char *_fetch_multibyte_char_p;
  98 int _fetch_multibyte_char_len;
  99
 100 /* Offset to add to a non-ASCII value when inserting it.  */
 101 EMACS_INT nonascii_insert_offset;
 102
 103 /* Translation table for converting non-ASCII unibyte characters
 104    to multibyte codes, or nil.  */
 105 Lisp_Object Vnonascii_translation_table;
 106
 107 /* List of all possible generic characters.  */
 108 Lisp_Object Vgeneric_character_list;
 109
 110 \f
 111 void
 112 invalid_character (c)
 113      int c;
 114 {
 115   error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
 116 }
 117
 118 /* Parse string STR of length LENGTH and fetch information of a
 119    character at STR.  Set BYTES to the byte length the character
 120    occupies, CHARSET, C1, C2 to proper values of the character. */
 121
 122 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2)             \
 123   do {                                                                       \
 124     (c1) = *(str);                                                           \
 125     (bytes) = BYTES_BY_CHAR_HEAD (c1);                                       \
 126     if ((bytes) == 1)                                                        \
 127       (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
 128     else if ((bytes) == 2)                                                   \
 129       {                                                                      \
 130         if ((c1) == LEADING_CODE_8_BIT_CONTROL)                              \
 131           (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20;         \
 132         else                                                                 \
 133           (charset) = (c1), (c1) = (str)[1] & 0x7F;                          \
 134       }                                                                      \
 135     else if ((bytes) == 3)                                                   \
 136       {                                                                      \
 137         if ((c1) < LEADING_CODE_PRIVATE_11)                                  \
 138           (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F;  \
 139         else                                                                 \
 140           (charset) = (str)[1], (c1) = (str)[2] & 0x7F;                      \
 141       }                                                                      \
 142     else                                                                     \
 143       (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F;  \
 144   } while (0)
 145
 146 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.
 147    Note that this intentionally allows invalid components, such
 148    as 0xA0 0xA0, because there exist many files that contain
 149    such invalid byte sequences, especially in EUC-GB. */
 150 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2)        \
 151   ((charset) == CHARSET_ASCII                           \
 152    ? ((c1) >= 0 && (c1) <= 0x7F)                        \
 153    : ((charset) == CHARSET_8_BIT_CONTROL                \
 154       ? ((c1) >= 0x80 && (c1) <= 0x9F)                  \
 155       : ((charset) == CHARSET_8_BIT_GRAPHIC             \
 156          ? ((c1) >= 0x80 && (c1) <= 0xFF)               \
 157          : (CHARSET_DIMENSION (charset) == 1            \
 158             ? ((c1) >= 0x20 && (c1) <= 0x7F)            \
 159             : ((c1) >= 0x20 && (c1) <= 0x7F             \
 160                && (c2) >= 0x20 && (c2) <= 0x7F)))))
 161
 162 /* Store multi-byte form of the character C in STR.  The caller should
 163    allocate at least 4-byte area at STR in advance.  Returns the
 164    length of the multi-byte form.  If C is an invalid character code,
 165    return -1.  */
 166
 167 int
 168 char_to_string_1 (c, str)
 169      int c;
 170      unsigned char *str;
 171 {
 172   unsigned char *p = str;
 173
 174   if (c & CHAR_MODIFIER_MASK)   /* This includes the case C is negative.  */
 175     {
 176       /* Multibyte character can't have a modifier bit.  */
 177       if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 178         return -1;
 179
 180       /* For Meta, Shift, and Control modifiers, we need special care.  */
 181       if (c & CHAR_META)
 182         {
 183           /* Move the meta bit to the right place for a string.  */
 184           c = (c & ~CHAR_META) | 0x80;
 185         }
 186       if (c & CHAR_SHIFT)
 187         {
 188           /* Shift modifier is valid only with [A-Za-z].  */
 189           if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 190             c &= ~CHAR_SHIFT;
 191           else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 192             c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 193         }
 194       if (c & CHAR_CTL)
 195         {
 196           /* Simulate the code in lread.c.  */
 197           /* Allow `\C- ' and `\C-?'.  */
 198           if (c == (CHAR_CTL | ' '))
 199             c = 0;
 200           else if (c == (CHAR_CTL | '?'))
 201             c = 127;
 202           /* ASCII control chars are made from letters (both cases),
 203              as well as the non-letters within 0100...0137.  */
 204           else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 205             c &= (037 | (~0177 & ~CHAR_CTL));
 206           else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 207             c &= (037 | (~0177 & ~CHAR_CTL));
 208         }
 209
 210       /* If C still has any modifier bits, just ignore it.  */
 211       c &= ~CHAR_MODIFIER_MASK;
 212     }
 213
 214   if (SINGLE_BYTE_CHAR_P (c))
 215     {
 216       if (ASCII_BYTE_P (c) || c >= 0xA0)
 217         *p++ = c;
 218       else
 219         {
 220           *p++ = LEADING_CODE_8_BIT_CONTROL;
 221           *p++ = c + 0x20;
 222         }
 223     }
 224   else if (CHAR_VALID_P (c, 0))
 225     {
 226       int charset, c1, c2;
 227
 228       SPLIT_CHAR (c, charset, c1, c2);
 229
 230       if (charset >= LEADING_CODE_EXT_11)
 231         *p++ = (charset < LEADING_CODE_EXT_12
 232                 ? LEADING_CODE_PRIVATE_11
 233                 : (charset < LEADING_CODE_EXT_21
 234                    ? LEADING_CODE_PRIVATE_12
 235                    : (charset < LEADING_CODE_EXT_22
 236                       ? LEADING_CODE_PRIVATE_21
 237                       : LEADING_CODE_PRIVATE_22)));
 238       *p++ = charset;
 239       if ((c1 > 0 && c1 < 32) || (c2 > 0 && c2 < 32))
 240         return -1;
 241       if (c1)
 242         {
 243           *p++ = c1 | 0x80;
 244           if (c2 > 0)
 245             *p++ = c2 | 0x80;
 246         }
 247     }
 248   else
 249     return -1;
 250
 251   return (p - str);
 252 }
 253
 254
 255 /* Store multi-byte form of the character C in STR.  The caller should
 256    allocate at least 4-byte area at STR in advance.  Returns the
 257    length of the multi-byte form.  If C is an invalid character code,
 258    signal an error.
 259
 260    Use macro `CHAR_STRING (C, STR)' instead of calling this function
 261    directly if C can be an ASCII character.  */
 262
 263 int
 264 char_to_string (c, str)
 265      int c;
 266      unsigned char *str;
 267 {
 268   int len;
 269   len = char_to_string_1 (c, str);
 270   if (len == -1)
 271     invalid_character (c);
 272   return len;
 273 }
 274
 275
 276 /* Return the non-ASCII character corresponding to multi-byte form at
 277    STR of length LEN.  If ACTUAL_LEN is not NULL, store the byte
 278    length of the multibyte form in *ACTUAL_LEN.
 279
 280    Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
 281    this function directly if you want ot handle ASCII characters as
 282    well.  */
 283
 284 int
 285 string_to_char (str, len, actual_len)
 286      const unsigned char *str;
 287      int len, *actual_len;
 288 {
 289   int c, bytes, charset, c1, c2;
 290
 291   SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
 292   c = MAKE_CHAR (charset, c1, c2);
 293   if (actual_len)
 294     *actual_len = bytes;
 295   return c;
 296 }
 297
 298 /* Return the length of the multi-byte form at string STR of length LEN.
 299    Use the macro MULTIBYTE_FORM_LENGTH instead.  */
 300 int
 301 multibyte_form_length (str, len)
 302      const unsigned char *str;
 303      int len;
 304 {
 305   int bytes;
 306
 307   PARSE_MULTIBYTE_SEQ (str, len, bytes);
 308   return bytes;
 309 }
 310
 311 /* Check multibyte form at string STR of length LEN and set variables
 312    pointed by CHARSET, C1, and C2 to charset and position codes of the
 313    character at STR, and return 0.  If there's no multibyte character,
 314    return -1.  This should be used only in the macro SPLIT_STRING
 315    which checks range of STR in advance.  */
 316
 317 int
 318 split_string (str, len, charset, c1, c2)
 319      const unsigned char *str;
 320      unsigned char *c1, *c2;
 321      int len, *charset;
 322 {
 323   register int bytes, cs, code1, code2 = -1;
 324
 325   SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
 326   if (cs == CHARSET_ASCII)
 327     return -1;
 328   *charset = cs;
 329   *c1 = code1;
 330   *c2 = code2;
 331   return 0;
 332 }
 333
 334 /* Return 1 iff character C has valid printable glyph.
 335    Use the macro CHAR_PRINTABLE_P instead.  */
 336 int
 337 char_printable_p (c)
 338      int c;
 339 {
 340   int charset, c1, c2;
 341
 342   if (ASCII_BYTE_P (c))
 343     return 1;
 344   else if (SINGLE_BYTE_CHAR_P (c))
 345     return 0;
 346   else if (c >= MAX_CHAR)
 347     return 0;
 348
 349   SPLIT_CHAR (c, charset, c1, c2);
 350   if (! CHARSET_DEFINED_P (charset))
 351     return 0;
 352   if (CHARSET_CHARS (charset) == 94
 353       ? c1 <= 32 || c1 >= 127
 354       : c1 < 32)
 355     return 0;
 356   if (CHARSET_DIMENSION (charset) == 2
 357       && (CHARSET_CHARS (charset) == 94
 358           ? c2 <= 32 || c2 >= 127
 359           : c2 < 32))
 360     return 0;
 361   return 1;
 362 }
 363
 364 /* Translate character C by translation table TABLE.  If C
 365    is negative, translate a character specified by CHARSET, C1, and C2
 366    (C1 and C2 are code points of the character).  If no translation is
 367    found in TABLE, return C.  */
 368 int
 369 translate_char (table, c, charset, c1, c2)
 370      Lisp_Object table;
 371      int c, charset, c1, c2;
 372 {
 373   Lisp_Object ch;
 374   int alt_charset, alt_c1, alt_c2, dimension;
 375
 376   if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
 377   if (!CHAR_TABLE_P (table)
 378       || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
 379     return c;
 380
 381   SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
 382   dimension = CHARSET_DIMENSION (alt_charset);
 383   if ((dimension == 1 && alt_c1 > 0) || (dimension == 2 && alt_c2 > 0))
 384     /* CH is not a generic character, just return it.  */
 385     return XFASTINT (ch);
 386
 387   /* Since CH is a generic character, we must return a specific
 388      charater which has the same position codes as C from CH.  */
 389   if (charset < 0)
 390     SPLIT_CHAR (c, charset, c1, c2);
 391   if (dimension != CHARSET_DIMENSION (charset))
 392     /* We can't make such a character because of dimension mismatch.  */
 393     return c;
 394   return MAKE_CHAR (alt_charset, c1, c2);
 395 }
 396
 397 /* Convert the unibyte character C to multibyte based on
 398    Vnonascii_translation_table or nonascii_insert_offset.  If they can't
 399    convert C to a valid multibyte character, convert it based on
 400    DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character.  */
 401
 402 int
 403 unibyte_char_to_multibyte (c)
 404      int c;
 405 {
 406   if (c < 0400 && c >= 0200)
 407     {
 408       int c_save = c;
 409
 410       if (! NILP (Vnonascii_translation_table))
 411         {
 412           c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
 413           if (c >= 0400 && ! char_valid_p (c, 0))
 414             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 415         }
 416       else if (c >= 0240 && nonascii_insert_offset > 0)
 417         {
 418           c += nonascii_insert_offset;
 419           if (c < 0400 || ! char_valid_p (c, 0))
 420             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 421         }
 422       else if (c >= 0240)
 423         c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 424     }
 425   return c;
 426 }
 427
 428
 429 /* Convert the multibyte character C to unibyte 8-bit character based
 430    on Vnonascii_translation_table or nonascii_insert_offset.  If
 431    REV_TBL is non-nil, it should be a reverse table of
 432    Vnonascii_translation_table, i.e. what given by:
 433      Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0))  */
 434
 435 int
 436 multibyte_char_to_unibyte (c, rev_tbl)
 437      int c;
 438      Lisp_Object rev_tbl;
 439 {
 440   if (!SINGLE_BYTE_CHAR_P (c))
 441     {
 442       int c_save = c;
 443
 444       if (! CHAR_TABLE_P (rev_tbl)
 445           && CHAR_TABLE_P (Vnonascii_translation_table))
 446         rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
 447                                           make_number (0));
 448       if (CHAR_TABLE_P (rev_tbl))
 449         {
 450           Lisp_Object temp;
 451           temp = Faref (rev_tbl, make_number (c));
 452           if (INTEGERP (temp))
 453             c = XINT (temp);
 454           if (c >= 256)
 455             c = (c_save & 0177) + 0200;
 456         }
 457       else
 458         {
 459           if (nonascii_insert_offset > 0)
 460             c -= nonascii_insert_offset;
 461           if (c < 128 || c >= 256)
 462             c = (c_save & 0177) + 0200;
 463         }
 464     }
 465
 466   return c;
 467 }
 468
 469 \f
 470 /* Update the table Vcharset_table with the given arguments (see the
 471    document of `define-charset' for the meaning of each argument).
 472    Several other table contents are also updated.  The caller should
 473    check the validity of CHARSET-ID and the remaining arguments in
 474    advance.  */
 475
 476 void
 477 update_charset_table (charset_id, dimension, chars, width, direction,
 478                       iso_final_char, iso_graphic_plane,
 479                       short_name, long_name, description)
 480      Lisp_Object charset_id, dimension, chars, width, direction;
 481      Lisp_Object iso_final_char, iso_graphic_plane;
 482      Lisp_Object short_name, long_name, description;
 483 {
 484   int charset = XINT (charset_id);
 485   int bytes;
 486   unsigned char leading_code_base, leading_code_ext;
 487
 488   if (NILP (CHARSET_TABLE_ENTRY (charset)))
 489     CHARSET_TABLE_ENTRY (charset)
 490       = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
 491
 492   if (NILP (long_name))
 493     long_name = short_name;
 494   if (NILP (description))
 495     description = long_name;
 496
 497   /* Get byte length of multibyte form, base leading-code, and
 498      extended leading-code of the charset.  See the comment under the
 499      title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h.  */
 500   bytes = XINT (dimension);
 501   if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 502     {
 503       /* Official charset, it doesn't have an extended leading-code.  */
 504       if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
 505         bytes += 1; /* For a base leading-code.  */
 506       leading_code_base = charset;
 507       leading_code_ext = 0;
 508     }
 509   else
 510     {
 511       /* Private charset.  */
 512       bytes += 2; /* For base and extended leading-codes.  */
 513       leading_code_base
 514         = (charset < LEADING_CODE_EXT_12
 515            ? LEADING_CODE_PRIVATE_11
 516            : (charset < LEADING_CODE_EXT_21
 517               ? LEADING_CODE_PRIVATE_12
 518               : (charset < LEADING_CODE_EXT_22
 519                  ? LEADING_CODE_PRIVATE_21
 520                  : LEADING_CODE_PRIVATE_22)));
 521       leading_code_ext = charset;
 522       if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
 523         error ("Invalid dimension for the charset-ID %d", charset);
 524     }
 525
 526   CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
 527   CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
 528   CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
 529   CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
 530   CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
 531   CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
 532   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
 533     = make_number (leading_code_base);
 534   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
 535     = make_number (leading_code_ext);
 536   CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
 537   CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
 538     = iso_graphic_plane;
 539   CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
 540   CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
 541   CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
 542   CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
 543
 544   {
 545     /* If we have already defined a charset which has the same
 546        DIMENSION, CHARS and ISO-FINAL-CHAR but the different
 547        DIRECTION, we must update the entry REVERSE-CHARSET of both
 548        charsets.  If there's no such charset, the value of the entry
 549        is set to nil.  */
 550     int i;
 551
 552     for (i = 0; i <= MAX_CHARSET; i++)
 553       if (!NILP (CHARSET_TABLE_ENTRY (i)))
 554         {
 555           if (CHARSET_DIMENSION (i) == XINT (dimension)
 556               && CHARSET_CHARS (i) == XINT (chars)
 557               && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
 558               && CHARSET_DIRECTION (i) != XINT (direction))
 559             {
 560               CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 561                 = make_number (i);
 562               CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
 563               break;
 564             }
 565         }
 566     if (i > MAX_CHARSET)
 567       /* No such a charset.  */
 568       CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 569         = make_number (-1);
 570   }
 571
 572   if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
 573       && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 574     {
 575       bytes_by_char_head[leading_code_base] = bytes;
 576       width_by_char_head[leading_code_base] = XINT (width);
 577
 578       /* Update table emacs_code_class.  */
 579       emacs_code_class[charset] = (bytes == 2
 580                                    ? EMACS_leading_code_2
 581                                    : (bytes == 3
 582                                       ? EMACS_leading_code_3
 583                                       : EMACS_leading_code_4));
 584     }
 585
 586   /* Update table iso_charset_table.  */
 587   if (XINT (iso_final_char) >= 0
 588       && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
 589     ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
 590 }
 591
 592 #ifdef emacs
 593
 594 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
 595    is invalid.  */
 596 int
 597 get_charset_id (charset_symbol)
 598      Lisp_Object charset_symbol;
 599 {
 600   Lisp_Object val;
 601   int charset;
 602
 603   /* This originally used a ?: operator, but reportedly the HP-UX
 604      compiler version HP92453-01 A.10.32.22 miscompiles that.  */
 605   if (SYMBOLP (charset_symbol)
 606       && VECTORP (val = Fget (charset_symbol, Qcharset))
 607       && CHARSET_VALID_P (charset =
 608                           XINT (XVECTOR (val)->contents[CHARSET_ID_IDX])))
 609     return charset;
 610   else
 611     return -1;
 612 }
 613
 614 /* Return an identification number for a new private charset of
 615    DIMENSION and WIDTH.  If there's no more room for the new charset,
 616    return 0.  */
 617 Lisp_Object
 618 get_new_private_charset_id (dimension, width)
 619      int dimension, width;
 620 {
 621   int charset, from, to;
 622
 623   if (dimension == 1)
 624     {
 625       from = LEADING_CODE_EXT_11;
 626       to = LEADING_CODE_EXT_21;
 627     }
 628   else
 629     {
 630       from = LEADING_CODE_EXT_21;
 631       to = LEADING_CODE_EXT_MAX + 1;
 632     }
 633
 634   for (charset = from; charset < to; charset++)
 635     if (!CHARSET_DEFINED_P (charset)) break;
 636
 637   return make_number (charset < to ? charset : 0);
 638 }
 639
 640 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
 641        doc: /* Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.
 642 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is
 643  treated as a private charset.
 644 INFO-VECTOR is a vector of the format:
 645    [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE
 646     SHORT-NAME LONG-NAME DESCRIPTION]
 647 The meanings of each elements is as follows:
 648 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.
 649 CHARS (integer) is the number of characters in a dimension: 94 or 96.
 650 WIDTH (integer) is the number of columns a character in the charset
 651 occupies on the screen: one of 0, 1, and 2.
 652
 653 DIRECTION (integer) is the rendering direction of characters in the
 654 charset when rendering.  If 0, render from left to right, else
 655 render from right to left.
 656
 657 ISO-FINAL-CHAR (character) is the final character of the
 658 corresponding ISO 2022 charset.
 659 It may be -1 if the charset is internal use only.
 660
 661 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked
 662 while encoding to variants of ISO 2022 coding system, one of the
 663 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).
 664 It may be -1 if the charset is internal use only.
 665
 666 SHORT-NAME (string) is the short name to refer to the charset.
 667
 668 LONG-NAME (string) is the long name to refer to the charset.
 669
 670 DESCRIPTION (string) is the description string of the charset.  */)
 671        (charset_id, charset_symbol, info_vector)
 672      Lisp_Object charset_id, charset_symbol, info_vector;
 673 {
 674   Lisp_Object *vec;
 675
 676   if (!NILP (charset_id))
 677     CHECK_NUMBER (charset_id);
 678   CHECK_SYMBOL (charset_symbol);
 679   CHECK_VECTOR (info_vector);
 680
 681   if (! NILP (charset_id))
 682     {
 683       if (! CHARSET_VALID_P (XINT (charset_id)))
 684         error ("Invalid CHARSET: %d", XINT (charset_id));
 685       else if (CHARSET_DEFINED_P (XINT (charset_id)))
 686         error ("Already defined charset: %d", XINT (charset_id));
 687     }
 688
 689   vec = XVECTOR (info_vector)->contents;
 690   if (XVECTOR (info_vector)->size != 9
 691       || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
 692       || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
 693       || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
 694       || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
 695       || !INTEGERP (vec[4])
 696       || !(XINT (vec[4]) == -1 || (XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~'))
 697       || !INTEGERP (vec[5])
 698       || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
 699       || !STRINGP (vec[6])
 700       || !STRINGP (vec[7])
 701       || !STRINGP (vec[8]))
 702     error ("Invalid info-vector argument for defining charset %s",
 703            SDATA (SYMBOL_NAME (charset_symbol)));
 704
 705   if (NILP (charset_id))
 706     {
 707       charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
 708       if (XINT (charset_id) == 0)
 709         error ("There's no room for a new private charset %s",
 710                SDATA (SYMBOL_NAME (charset_symbol)));
 711     }
 712
 713   update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
 714                         vec[4], vec[5], vec[6], vec[7], vec[8]);
 715   Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
 716   CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
 717   Vcharset_list = Fcons (charset_symbol, Vcharset_list);
 718   Fupdate_coding_systems_internal ();
 719   return Qnil;
 720 }
 721
 722 DEFUN ("generic-character-list", Fgeneric_character_list,
 723        Sgeneric_character_list, 0, 0, 0,
 724        doc: /* Return a list of all possible generic characters.
 725 It includes a generic character for a charset not yet defined.  */)
 726      ()
 727 {
 728   return Vgeneric_character_list;
 729 }
 730
 731 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
 732        Sget_unused_iso_final_char, 2, 2, 0,
 733        doc: /* Return an unused ISO's final char for a charset of DIMENSION and CHARS.
 734 DIMENSION is the number of bytes to represent a character: 1 or 2.
 735 CHARS is the number of characters in a dimension: 94 or 96.
 736
 737 This final char is for private use, thus the range is `0' (48) .. `?' (63).
 738 If there's no unused final char for the specified kind of charset,
 739 return nil.  */)
 740      (dimension, chars)
 741      Lisp_Object dimension, chars;
 742 {
 743   int final_char;
 744
 745   CHECK_NUMBER (dimension);
 746   CHECK_NUMBER (chars);
 747   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 748     error ("Invalid charset dimension %d, it should be 1 or 2",
 749            XINT (dimension));
 750   if (XINT (chars) != 94 && XINT (chars) != 96)
 751     error ("Invalid charset chars %d, it should be 94 or 96",
 752            XINT (chars));
 753   for (final_char = '0'; final_char <= '?'; final_char++)
 754     {
 755       if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
 756         break;
 757     }
 758   return (final_char <= '?' ? make_number (final_char) : Qnil);
 759 }
 760
 761 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
 762        4, 4, 0,
 763        doc: /* Declare an equivalent charset for ISO-2022 decoding.
 764
 765 On decoding by an ISO-2022 base coding system, when a charset
 766 specified by DIMENSION, CHARS, and FINAL-CHAR is designated, behave as
 767 if CHARSET is designated instead.  */)
 768      (dimension, chars, final_char, charset)
 769      Lisp_Object dimension, chars, final_char, charset;
 770 {
 771   int charset_id;
 772
 773   CHECK_NUMBER (dimension);
 774   CHECK_NUMBER (chars);
 775   CHECK_NUMBER (final_char);
 776   CHECK_SYMBOL (charset);
 777
 778   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 779     error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
 780   if (XINT (chars) != 94 && XINT (chars) != 96)
 781     error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
 782   if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
 783     error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
 784   if ((charset_id = get_charset_id (charset)) < 0)
 785     error ("Invalid charset %s", SDATA (SYMBOL_NAME (charset)));
 786
 787   ISO_CHARSET_TABLE (dimension, chars, final_char) = charset_id;
 788   return Qnil;
 789 }
 790
 791 /* Return information about charsets in the text at PTR of NBYTES
 792    bytes, which are NCHARS characters.  The value is:
 793
 794         0: Each character is represented by one byte.  This is always
 795            true for unibyte text.
        1: No charsets other than ascii, eight-bit-control,
           eight-bit-graphic, and latin-1 are found.
 798         2: Otherwise.
 799
 800    In addition, if CHARSETS is nonzero, for each found charset N, set
 801    CHARSETS[N] to 1.  For that, callers should allocate CHARSETS
 802    (MAX_CHARSET + 1 elements) in advance.  It may lookup a translation
 803    table TABLE if supplied.  For invalid charsets, set CHARSETS[1] to
 804    1 (note that there's no charset whose ID is 1).  */
 805
 806 int
 807 find_charset_in_text (ptr, nchars, nbytes, charsets, table)
 808      const unsigned char *ptr;
 809      int nchars, nbytes, *charsets;
 810      Lisp_Object table;
 811 {
 812   if (nchars == nbytes)
 813     {
 814       if (charsets && nbytes > 0)
 815         {
 816           const unsigned char *endp = ptr + nbytes;
 817           int maskbits = 0;
 818
 819           while (ptr < endp && maskbits != 7)
 820             {
 821               maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
 822               ptr++;
 823             }
 824
 825           if (maskbits & 1)
 826             charsets[CHARSET_ASCII] = 1;
 827           if (maskbits & 2)
 828             charsets[CHARSET_8_BIT_CONTROL] = 1;
 829           if (maskbits & 4)
 830             charsets[CHARSET_8_BIT_GRAPHIC] = 1;
 831         }
 832       return 0;
 833     }
 834   else
 835     {
 836       int return_val = 1;
 837       int bytes, charset, c1, c2;
 838
 839       if (! CHAR_TABLE_P (table))
 840         table = Qnil;
 841
 842       while (nchars-- > 0)
 843         {
 844           SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
 845           ptr += bytes;
 846
 847           if (!CHARSET_DEFINED_P (charset))
 848             charset = 1;
 849           else if (! NILP (table))
 850             {
 851               int c = translate_char (table, -1, charset, c1, c2);
 852               if (c >= 0)
 853                 charset = CHAR_CHARSET (c);
 854             }
 855
 856           if (return_val == 1
 857               && charset != CHARSET_ASCII
 858               && charset != CHARSET_8_BIT_CONTROL
 859               && charset != CHARSET_8_BIT_GRAPHIC
 860               && charset != charset_latin_iso8859_1)
 861             return_val = 2;
 862
 863           if (charsets)
 864             charsets[charset] = 1;
 865           else if (return_val == 2)
 866             break;
 867         }
 868       return return_val;
 869     }
 870 }
 871
 872 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
 873        2, 3, 0,
 874        doc: /* Return a list of charsets in the region between BEG and END.
 875 BEG and END are buffer positions.
 876 Optional arg TABLE if non-nil is a translation table to look up.
 877
 878 If the region contains invalid multibyte characters,
 879 `unknown' is included in the returned list.
 880
 881 If the current buffer is unibyte, the returned list may contain
 882 only `ascii', `eight-bit-control', and `eight-bit-graphic'.  */)
 883      (beg, end, table)
 884      Lisp_Object beg, end, table;
 885 {
 886   int charsets[MAX_CHARSET + 1];
 887   int from, from_byte, to, stop, stop_byte, i;
 888   Lisp_Object val;
 889
 890   validate_region (&beg, &end);
 891   from = XFASTINT (beg);
 892   stop = to = XFASTINT (end);
 893
 894   if (from < GPT && GPT < to)
 895     {
 896       stop = GPT;
 897       stop_byte = GPT_BYTE;
 898     }
 899   else
 900     stop_byte = CHAR_TO_BYTE (stop);
 901
 902   from_byte = CHAR_TO_BYTE (from);
 903
 904   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 905   while (1)
 906     {
 907       find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from,
 908                             stop_byte - from_byte, charsets, table);
 909       if (stop < to)
 910         {
 911           from = stop, from_byte = stop_byte;
 912           stop = to, stop_byte = CHAR_TO_BYTE (stop);
 913         }
 914       else
 915         break;
 916     }
 917
 918   val = Qnil;
 919   if (charsets[1])
 920     val = Fcons (Qunknown, val);
 921   for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
 922     if (charsets[i])
 923       val = Fcons (CHARSET_SYMBOL (i), val);
 924   if (charsets[0])
 925     val = Fcons (Qascii, val);
 926   return val;
 927 }
 928
 929 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
 930        1, 2, 0,
 931        doc: /* Return a list of charsets in STR.
 932 Optional arg TABLE if non-nil is a translation table to look up.
 933
 934 If the string contains invalid multibyte characters,
 935 `unknown' is included in the returned list.
 936
 937 If STR is unibyte, the returned list may contain
 938 only `ascii', `eight-bit-control', and `eight-bit-graphic'.  */)
 939      (str, table)
 940      Lisp_Object str, table;
 941 {
 942   int charsets[MAX_CHARSET + 1];
 943   int i;
 944   Lisp_Object val;
 945
 946   CHECK_STRING (str);
 947
 948   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 949   find_charset_in_text (SDATA (str), SCHARS (str),
 950                         SBYTES (str), charsets, table);
 951
 952   val = Qnil;
 953   if (charsets[1])
 954     val = Fcons (Qunknown, val);
 955   for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
 956     if (charsets[i])
 957       val = Fcons (CHARSET_SYMBOL (i), val);
 958   if (charsets[0])
 959     val = Fcons (Qascii, val);
 960   return val;
 961 }
 962
 963 \f
 964 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
 965        doc: /* Return a character made from arguments.
 966 Internal use only.  */)
 967      (charset, code1, code2)
 968      Lisp_Object charset, code1, code2;
 969 {
 970   int charset_id, c1, c2;
 971
 972   CHECK_NUMBER (charset);
 973   charset_id = XINT (charset);
 974   if (!CHARSET_DEFINED_P (charset_id))
 975     error ("Invalid charset ID: %d", XINT (charset));
 976
 977   if (NILP (code1))
 978     c1 = 0;
 979   else
 980     {
 981       CHECK_NUMBER (code1);
 982       c1 = XINT (code1);
 983     }
 984   if (NILP (code2))
 985     c2 = 0;
 986   else
 987     {
 988       CHECK_NUMBER (code2);
 989       c2 = XINT (code2);
 990     }
 991
 992   if (charset_id == CHARSET_ASCII)
 993     {
 994       if (c1 < 0 || c1 > 0x7F)
 995         goto invalid_code_posints;
 996       return make_number (c1);
 997     }
 998   else if (charset_id == CHARSET_8_BIT_CONTROL)
 999     {
1000       if (NILP (code1))
1001         c1 = 0x80;
1002       else if (c1 < 0x80 || c1 > 0x9F)
1003         goto invalid_code_posints;
1004       return make_number (c1);
1005     }
1006   else if (charset_id == CHARSET_8_BIT_GRAPHIC)
1007     {
1008       if (NILP (code1))
1009         c1 = 0xA0;
1010       else if (c1 < 0xA0 || c1 > 0xFF)
1011         goto invalid_code_posints;
1012       return make_number (c1);
1013     }
1014   else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
1015     goto invalid_code_posints;
1016   c1 &= 0x7F;
1017   c2 &= 0x7F;
1018   if (c1 == 0
1019       ? c2 != 0
1020       : (c2 == 0
1021          ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
1022          : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
1023     goto invalid_code_posints;
1024   return make_number (MAKE_CHAR (charset_id, c1, c2));
1025
1026  invalid_code_posints:
1027   error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
1028 }
1029
1030 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1031        doc: /* Return list of charset and one or two position-codes of CH.
1032 If CH is invalid as a character code,
1033 return a list of symbol `unknown' and CH.  */)
1034      (ch)
1035      Lisp_Object ch;
1036 {
1037   int c, charset, c1, c2;
1038
1039   CHECK_NUMBER (ch);
1040   c = XFASTINT (ch);
1041   if (!CHAR_VALID_P (c, 1))
1042     return Fcons (Qunknown, Fcons (ch, Qnil));
1043   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1044   return (c2 >= 0
1045           ? Fcons (CHARSET_SYMBOL (charset),
1046                    Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1047           : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1048 }
1049
1050 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1051        doc: /* Return charset of CH.  */)
1052      (ch)
1053      Lisp_Object ch;
1054 {
1055   CHECK_NUMBER (ch);
1056
1057   return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1058 }
1059
1060 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1061        doc: /* Return charset of a character in the current buffer at position POS.
1062 If POS is nil, it defauls to the current point.
1063 If POS is out of range, the value is nil.  */)
1064      (pos)
1065      Lisp_Object pos;
1066 {
1067   Lisp_Object ch;
1068   int charset;
1069
1070   ch = Fchar_after (pos);
1071   if (! INTEGERP (ch))
1072     return ch;
1073   charset = CHAR_CHARSET (XINT (ch));
1074   return CHARSET_SYMBOL (charset);
1075 }
1076
1077 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1078        doc: /* Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.
1079
1080 ISO 2022's designation sequence (escape sequence) distinguishes charsets
1081 by their DIMENSION, CHARS, and FINAL-CHAR,
1082 where as Emacs distinguishes them by charset symbol.
1083 See the documentation of the function `charset-info' for the meanings of
1084 DIMENSION, CHARS, and FINAL-CHAR.  */)
1085      (dimension, chars, final_char)
1086      Lisp_Object dimension, chars, final_char;
1087 {
1088   int charset;
1089
1090   CHECK_NUMBER (dimension);
1091   CHECK_NUMBER (chars);
1092   CHECK_NUMBER (final_char);
1093
1094   if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1095     return Qnil;
1096   return CHARSET_SYMBOL (charset);
1097 }
1098
1099 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1100    generic character.  If GENERICP is zero, return nonzero iff C is a
1101    valid normal character.  Do not call this function directly,
1102    instead use macro CHAR_VALID_P.  */
1103 int
1104 char_valid_p (c, genericp)
1105      int c, genericp;
1106 {
1107   int charset, c1, c2;
1108
1109   if (c < 0 || c >= MAX_CHAR)
1110     return 0;
1111   if (SINGLE_BYTE_CHAR_P (c))
1112     return 1;
1113   SPLIT_CHAR (c, charset, c1, c2);
1114   if (genericp)
1115     {
1116       if (c1)
1117         {
1118           if (c2 <= 0) c2 = 0x20;
1119         }
1120       else
1121         {
1122           if (c2 <= 0) c1 = c2 = 0x20;
1123         }
1124     }
1125   return (CHARSET_DEFINED_P (charset)
1126           && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1127 }
1128
1129 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1130        doc: /* Return t if OBJECT is a valid normal character.
1131 If optional arg GENERICP is non-nil, also return t if OBJECT is
1132 a valid generic character.  */)
1133      (object, genericp)
1134      Lisp_Object object, genericp;
1135 {
1136   if (! NATNUMP (object))
1137     return Qnil;
1138   return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1139 }
1140
1141 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1142        Sunibyte_char_to_multibyte, 1, 1, 0,
1143        doc: /* Convert the unibyte character CH to multibyte character.
1144 The conversion is done based on `nonascii-translation-table' (which see)
1145  or `nonascii-insert-offset' (which see).  */)
1146      (ch)
1147      Lisp_Object ch;
1148 {
1149   int c;
1150
1151   CHECK_NUMBER (ch);
1152   c = XINT (ch);
1153   if (c < 0 || c >= 0400)
1154     error ("Invalid unibyte character: %d", c);
1155   c = unibyte_char_to_multibyte (c);
1156   if (c < 0)
1157     error ("Can't convert to multibyte character: %d", XINT (ch));
1158   return make_number (c);
1159 }
1160
1161 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1162        Smultibyte_char_to_unibyte, 1, 1, 0,
1163        doc: /* Convert the multibyte character CH to unibyte character.
1164 The conversion is done based on `nonascii-translation-table' (which see)
1165  or `nonascii-insert-offset' (which see).  */)
1166      (ch)
1167      Lisp_Object ch;
1168 {
1169   int c;
1170
1171   CHECK_NUMBER (ch);
1172   c = XINT (ch);
1173   if (! CHAR_VALID_P (c, 0))
1174     error ("Invalid multibyte character: %d", c);
1175   c = multibyte_char_to_unibyte (c, Qnil);
1176   if (c < 0)
1177     error ("Can't convert to unibyte character: %d", XINT (ch));
1178   return make_number (c);
1179 }
1180
1181 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1182        doc: /* Return 1 regardless of the argument CH.  */)
1183      (ch)
1184      Lisp_Object ch;
1185 {
1186   CHECK_NUMBER (ch);
1187   return make_number (1);
1188 }
1189
1190 /* Return how many bytes C will occupy in a multibyte buffer.
1191    Don't call this function directly, instead use macro CHAR_BYTES.  */
1192 int
1193 char_bytes (c)
1194      int c;
1195 {
1196   int charset;
1197
1198   if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1)))
1199     return 1;
1200   if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1201     return 1;
1202
1203   charset = CHAR_CHARSET (c);
1204   return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1205 }
1206
/* Return the width of the character whose multibyte form starts with
   the byte C.  The width is the number of columns the character
   occupies on the screen when displayed in the current buffer:

     - TAB takes `tab_width' columns and newline takes none;
     - other control characters and DEL take 2 columns when the
       buffer's `ctl_arrow' is non-nil, 4 otherwise;
     - the remaining ASCII characters take 1 column;
     - any other byte takes WIDTH_BY_CHAR_HEAD (C) columns if the
       buffer is multibyte and C is a base leading code, 4 otherwise.

   C is evaluated several times, so it must not have side effects.
   Each use of C is parenthesized so that any expression can be
   passed safely.  */

#define ONE_BYTE_CHAR_WIDTH(c)						\
  ((c) < 0x20								\
   ? ((c) == '\t'							\
      ? XFASTINT (current_buffer->tab_width)				\
      : ((c) == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2)))	\
   : ((c) < 0x7f							\
      ? 1								\
      : ((c) == 0x7F							\
	 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2)			\
	 : ((! NILP (current_buffer->enable_multibyte_characters)	\
	     && BASE_LEADING_CODE_P (c))				\
	    ? WIDTH_BY_CHAR_HEAD (c)					\
	    : 4))))
1224
1225 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1226        doc: /* Return width of CH when displayed in the current buffer.
1227 The width is measured by how many columns it occupies on the screen.
1228 Tab is taken to occupy `tab-width' columns.  */)
1229      (ch)
1230      Lisp_Object ch;
1231 {
1232   Lisp_Object val, disp;
1233   int c;
1234   struct Lisp_Char_Table *dp = buffer_display_table ();
1235
1236   CHECK_NUMBER (ch);
1237
1238   c = XINT (ch);
1239
1240   /* Get the way the display table would display it.  */
1241   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1242
1243   if (VECTORP (disp))
1244     XSETINT (val, XVECTOR (disp)->size);
1245   else if (SINGLE_BYTE_CHAR_P (c))
1246     XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1247   else
1248     {
1249       int charset = CHAR_CHARSET (c);
1250
1251       XSETFASTINT (val, CHARSET_WIDTH (charset));
1252     }
1253   return val;
1254 }
1255
1256 /* Return width of string STR of length LEN when displayed in the
1257    current buffer.  The width is measured by how many columns it
1258    occupies on the screen.  */
1259
1260 int
1261 strwidth (str, len)
1262      unsigned char *str;
1263      int len;
1264 {
1265   return c_string_width (str, len, -1, NULL, NULL);
1266 }
1267
1268 /* Return width of string STR of length LEN when displayed in the
1269    current buffer.  The width is measured by how many columns it
1270    occupies on the screen.  If PRECISION > 0, return the width of
1271    longest substring that doesn't exceed PRECISION, and set number of
1272    characters and bytes of the substring in *NCHARS and *NBYTES
1273    respectively.  */
1274
1275 int
1276 c_string_width (str, len, precision, nchars, nbytes)
1277      const unsigned char *str;
1278      int len, precision, *nchars, *nbytes;
1279 {
1280   int i = 0, i_byte = 0;
1281   int width = 0;
1282   int chars;
1283   struct Lisp_Char_Table *dp = buffer_display_table ();
1284
1285   while (i_byte < len)
1286     {
1287       int bytes, thiswidth;
1288       Lisp_Object val;
1289
1290       if (dp)
1291         {
1292           int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
1293
1294           chars = 1;
1295           val = DISP_CHAR_VECTOR (dp, c);
1296           if (VECTORP (val))
1297             thiswidth = XVECTOR (val)->size;
1298           else
1299             thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1300         }
1301       else
1302         {
1303           chars = 1;
1304           PARSE_MULTIBYTE_SEQ (str + i_byte, len - i_byte, bytes);
1305           thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1306         }
1307
1308       if (precision > 0
1309           && (width + thiswidth > precision))
1310         {
1311           *nchars = i;
1312           *nbytes = i_byte;
1313           return width;
1314         }
1315       i++;
1316       i_byte += bytes;
1317       width += thiswidth;
1318   }
1319
1320   if (precision > 0)
1321     {
1322       *nchars = i;
1323       *nbytes = i_byte;
1324     }
1325
1326   return width;
1327 }
1328
1329 /* Return width of Lisp string STRING when displayed in the current
1330    buffer.  The width is measured by how many columns it occupies on
1331    the screen while paying attention to compositions.  If PRECISION >
1332    0, return the width of longest substring that doesn't exceed
1333    PRECISION, and set number of characters and bytes of the substring
1334    in *NCHARS and *NBYTES respectively.  */
1335
1336 int
1337 lisp_string_width (string, precision, nchars, nbytes)
1338      Lisp_Object string;
1339      int precision, *nchars, *nbytes;
1340 {
1341   int len = SCHARS (string);
1342   int len_byte = SBYTES (string);
1343   const unsigned char *str = SDATA (string);
1344   int i = 0, i_byte = 0;
1345   int width = 0;
1346   struct Lisp_Char_Table *dp = buffer_display_table ();
1347
1348   while (i < len)
1349     {
1350       int chars, bytes, thiswidth;
1351       Lisp_Object val;
1352       int cmp_id;
1353       int ignore, end;
1354
1355       if (find_composition (i, -1, &ignore, &end, &val, string)
1356           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
1357               >= 0))
1358         {
1359           thiswidth = composition_table[cmp_id]->width;
1360           chars = end - i;
1361           bytes = string_char_to_byte (string, end) - i_byte;
1362         }
1363       else if (dp)
1364         {
1365           int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
1366
1367           chars = 1;
1368           val = DISP_CHAR_VECTOR (dp, c);
1369           if (VECTORP (val))
1370             thiswidth = XVECTOR (val)->size;
1371           else
1372             thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1373         }
1374       else
1375         {
1376           chars = 1;
1377           PARSE_MULTIBYTE_SEQ (str + i_byte, len_byte - i_byte, bytes);
1378           thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1379         }
1380
1381       if (precision > 0
1382           && (width + thiswidth > precision))
1383         {
1384           *nchars = i;
1385           *nbytes = i_byte;
1386           return width;
1387         }
1388       i += chars;
1389       i_byte += bytes;
1390       width += thiswidth;
1391   }
1392
1393   if (precision > 0)
1394     {
1395       *nchars = i;
1396       *nbytes = i_byte;
1397     }
1398
1399   return width;
1400 }
1401
1402 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1403        doc: /* Return width of STRING when displayed in the current buffer.
1404 Width is measured by how many columns it occupies on the screen.
1405 When calculating width of a multibyte character in STRING,
1406 only the base leading-code is considered; the validity of
1407 the following bytes is not checked.  Tabs in STRING are always
1408 taken to occupy `tab-width' columns.  */)
1409      (string)
1410      Lisp_Object string;
1411 {
1412   Lisp_Object val;
1413
1414   CHECK_STRING (string);
1415   XSETFASTINT (val, lisp_string_width (string, -1, NULL, NULL));
1416   return val;
1417 }
1418
1419 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1420        doc: /* Return the direction of CH.
1421 The returned value is 0 for left-to-right and 1 for right-to-left.  */)
1422      (ch)
1423      Lisp_Object ch;
1424 {
1425   int charset;
1426
1427   CHECK_NUMBER (ch);
1428   charset = CHAR_CHARSET (XFASTINT (ch));
1429   if (!CHARSET_DEFINED_P (charset))
1430     invalid_character (XINT (ch));
1431   return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1432 }
1433
1434 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1435        doc: /* Return number of characters between BEG and END.  */)
1436      (beg, end)
1437      Lisp_Object beg, end;
1438 {
1439   int from, to;
1440
1441   CHECK_NUMBER_COERCE_MARKER (beg);
1442   CHECK_NUMBER_COERCE_MARKER (end);
1443
1444   from = min (XFASTINT (beg), XFASTINT (end));
1445   to = max (XFASTINT (beg), XFASTINT (end));
1446
1447   return make_number (to - from);
1448 }
1449
1450 /* Return the number of characters in the NBYTES bytes at PTR.
1451    This works by looking at the contents and checking for multibyte sequences.
1452    However, if the current buffer has enable-multibyte-characters = nil,
1453    we treat each byte as a character.  */
1454
1455 int
1456 chars_in_text (ptr, nbytes)
1457      const unsigned char *ptr;
1458      int nbytes;
1459 {
1460   /* current_buffer is null at early stages of Emacs initialization.  */
1461   if (current_buffer == 0
1462       || NILP (current_buffer->enable_multibyte_characters))
1463     return nbytes;
1464
1465   return multibyte_chars_in_text (ptr, nbytes);
1466 }
1467
/* Return the number of characters in the NBYTES bytes at PTR,
   examining the contents for multibyte sequences.  Unlike
   chars_in_text, this ignores enable-multibyte-characters.  */

int
multibyte_chars_in_text (ptr, nbytes)
     const unsigned char *ptr;
     int nbytes;
{
  const unsigned char *limit = ptr + nbytes;
  int count = 0;
  int seq_bytes;

  /* Step over one multibyte sequence per iteration.  */
  for (; ptr < limit; ptr += seq_bytes)
    {
      PARSE_MULTIBYTE_SEQ (ptr, limit - ptr, seq_bytes);
      count++;
    }

  return count;
}
1492
/* Parse the unibyte text at STR of LEN bytes as if it were multibyte
   text, and store the resulting numbers of characters and bytes in
   *NCHARS and *NBYTES.  A byte that does not start a sequence
   accepted by UNIBYTE_STR_AS_MULTIBYTE_P is counted as 2 bytes,
   because 8-bit characters in the range 0x80..0x9F are represented
   by 2 bytes in multibyte text.  */
void
parse_str_as_multibyte (str, len, nchars, nbytes)
     const unsigned char *str;
     int len, *nchars, *nbytes;
{
  const unsigned char *limit = str + len;
  int char_count = 0, byte_count = 0;
  int seq_len;

  for (; str < limit; char_count++)
    {
      if (UNIBYTE_STR_AS_MULTIBYTE_P (str, limit - str, seq_len))
	{
	  /* Already a multibyte sequence; it is kept as is.  */
	  str += seq_len;
	  byte_count += seq_len;
	}
      else
	{
	  /* A lone byte that will need a two-byte form.  */
	  str++;
	  byte_count += 2;
	}
    }

  *nchars = char_count;
  *nbytes = byte_count;
}
1517
1518 /* Arrange unibyte text at STR of NBYTES bytes as multibyte text.
1519    It actually converts only 8-bit characters in the range 0x80..0x9F
1520    that don't contruct multibyte characters to multibyte forms.  If
1521    NCHARS is nonzero, set *NCHARS to the number of characters in the
1522    text.  It is assured that we can use LEN bytes at STR as a work
1523    area and that is enough.  Return the number of bytes of the
1524    resulting text.  */
1525
1526 int
1527 str_as_multibyte (str, len, nbytes, nchars)
1528      unsigned char *str;
1529      int len, nbytes, *nchars;
1530 {
1531   unsigned char *p = str, *endp = str + nbytes;
1532   unsigned char *to;
1533   int chars = 0;
1534   int n;
1535
1536   while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1537     p += n, chars++;
1538   if (nchars)
1539     *nchars = chars;
1540   if (p == endp)
1541     return nbytes;
1542
1543   to = p;
1544   nbytes = endp - p;
1545   endp = str + len;
1546   safe_bcopy (p, endp - nbytes, nbytes);
1547   p = endp - nbytes;
1548   while (p < endp)
1549     {
1550       if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1551         {
1552           while (n--)
1553             *to++ = *p++;
1554         }
1555       else
1556         {
1557           *to++ = LEADING_CODE_8_BIT_CONTROL;
1558           *to++ = *p++ + 0x20;
1559         }
1560       chars++;
1561     }
1562   if (nchars)
1563     *nchars = chars;
1564   return (to - str);
1565 }
1566
/* Parse the unibyte string at STR of LEN bytes, and return the
   number of bytes it may occupy when converted to a multibyte string
   by `str_to_multibyte'.  Bytes in the range 0x80..0x9F count as two
   bytes, all others as one.  */

int
parse_str_to_multibyte (str, len)
     unsigned char *str;
     int len;
{
  int total = 0;
  int i;

  for (i = 0; i < len; i++)
    {
      if (str[i] >= 0x80 && str[i] < 0xA0)
	total += 2;
      else
	total += 1;
    }
  return total;
}
1583
1584 /* Convert unibyte text at STR of NBYTES bytes to multibyte text
1585    that contains the same single-byte characters.  It actually
1586    converts all 8-bit characters to multibyte forms.  It is assured
1587    that we can use LEN bytes at STR as a work area and that is
1588    enough.  */
1589
1590 int
1591 str_to_multibyte (str, len, bytes)
1592      unsigned char *str;
1593      int len, bytes;
1594 {
1595   unsigned char *p = str, *endp = str + bytes;
1596   unsigned char *to;
1597
1598   while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++;
1599   if (p == endp)
1600     return bytes;
1601   to = p;
1602   bytes = endp - p;
1603   endp = str + len;
1604   safe_bcopy (p, endp - bytes, bytes);
1605   p = endp - bytes;
1606   while (p < endp)
1607     {
1608       if (*p < 0x80 || *p >= 0xA0)
1609         *to++ = *p++;
1610       else
1611         *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1612     }
1613   return (to - str);
1614 }
1615
1616 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
1617    actually converts only 8-bit characters in the range 0x80..0x9F to
1618    unibyte forms.  */
1619
1620 int
1621 str_as_unibyte (str, bytes)
1622      unsigned char *str;
1623      int bytes;
1624 {
1625   unsigned char *p = str, *endp = str + bytes;
1626   unsigned char *to = str;
1627
1628   while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++;
1629   to = p;
1630   while (p < endp)
1631     {
1632       if (*p == LEADING_CODE_8_BIT_CONTROL)
1633         *to++ = *(p + 1) - 0x20, p += 2;
1634       else
1635         *to++ = *p++;
1636     }
1637   return (to - str);
1638 }
1639
1640 \f
1641 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
1642   doc: /* Concatenate all the argument characters and make the result a string.
1643 usage: (string &rest CHARACTERS)  */)
1644      (n, args)
1645      int n;
1646      Lisp_Object *args;
1647 {
1648   int i, bufsize;
1649   unsigned char *buf, *p;
1650   int c;
1651   int multibyte = 0;
1652   Lisp_Object ret;
1653   USE_SAFE_ALLOCA;
1654
1655   bufsize = MAX_MULTIBYTE_LENGTH * n;
1656   SAFE_ALLOCA (buf, unsigned char *, bufsize);
1657   p = buf;
1658
1659   for (i = 0; i < n; i++)
1660     {
1661       CHECK_NUMBER (args[i]);
1662       if (!multibyte && !SINGLE_BYTE_CHAR_P (XFASTINT (args[i])))
1663         multibyte = 1;
1664     }
1665
1666   for (i = 0; i < n; i++)
1667     {
1668       c = XINT (args[i]);
1669       if (multibyte)
1670         p += CHAR_STRING (c, p);
1671       else
1672         *p++ = c;
1673     }
1674
1675   ret = make_string_from_bytes (buf, n, p - buf);
1676   SAFE_FREE ();
1677
1678   return ret;
1679 }
1680
1681 #endif /* emacs */
1682 \f
1683 int
1684 charset_id_internal (charset_name)
1685      char *charset_name;
1686 {
1687   Lisp_Object val;
1688
1689   val= Fget (intern (charset_name), Qcharset);
1690   if (!VECTORP (val))
1691     error ("Charset %s is not defined", charset_name);
1692
1693   return (XINT (XVECTOR (val)->contents[0]));
1694 }
1695
1696 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1697        Ssetup_special_charsets, 0, 0, 0, doc: /* Internal use only.  */)
1698      ()
1699 {
1700   charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1701   charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1702   charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1703   charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1704   charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1705   charset_big5_1 = charset_id_internal ("chinese-big5-1");
1706   charset_big5_2 = charset_id_internal ("chinese-big5-2");
1707   return Qnil;
1708 }
1709
1710 void
1711 init_charset_once ()
1712 {
1713   int i, j, k;
1714
1715   staticpro (&Vcharset_table);
1716   staticpro (&Vcharset_symbol_table);
1717   staticpro (&Vgeneric_character_list);
1718
1719   /* This has to be done here, before we call Fmake_char_table.  */
1720   Qcharset_table = intern ("charset-table");
1721   staticpro (&Qcharset_table);
1722
1723   /* Intern this now in case it isn't already done.
1724      Setting this variable twice is harmless.
1725      But don't staticpro it here--that is done in alloc.c.  */
1726   Qchar_table_extra_slots = intern ("char-table-extra-slots");
1727
1728   /* Now we are ready to set up this property, so we can
1729      create the charset table.  */
1730   Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1731   Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1732
1733   Qunknown = intern ("unknown");
1734   staticpro (&Qunknown);
1735   Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1736                                         Qunknown);
1737
1738   /* Setup tables.  */
1739   for (i = 0; i < 2; i++)
1740     for (j = 0; j < 2; j++)
1741       for (k = 0; k < 128; k++)
1742         iso_charset_table [i][j][k] = -1;
1743
1744   for (i = 0; i < 256; i++)
1745     bytes_by_char_head[i] = 1;
1746   bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1747   bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1748   bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1749   bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1750
1751   for (i = 0; i < 128; i++)
1752     width_by_char_head[i] = 1;
1753   for (; i < 256; i++)
1754     width_by_char_head[i] = 4;
1755   width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1756   width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1757   width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1758   width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1759
1760   {
1761     Lisp_Object val;
1762
1763     val = Qnil;
1764     for (i = 0x81; i < 0x90; i++)
1765       val = Fcons (make_number ((i - 0x70) << 7), val);
1766     for (; i < 0x9A; i++)
1767       val = Fcons (make_number ((i - 0x8F) << 14), val);
1768     for (i = 0xA0; i < 0xF0; i++)
1769       val = Fcons (make_number ((i - 0x70) << 7), val);
1770     for (; i < 0xFF; i++)
1771       val = Fcons (make_number ((i - 0xE0) << 14), val);
1772     Vgeneric_character_list = Fnreverse (val);
1773   }
1774
1775   nonascii_insert_offset = 0;
1776   Vnonascii_translation_table = Qnil;
1777 }
1778
1779 #ifdef emacs
1780
1781 void
1782 syms_of_charset ()
1783 {
1784   Qcharset = intern ("charset");
1785   staticpro (&Qcharset);
1786
1787   Qascii = intern ("ascii");
1788   staticpro (&Qascii);
1789
1790   Qeight_bit_control = intern ("eight-bit-control");
1791   staticpro (&Qeight_bit_control);
1792
1793   Qeight_bit_graphic = intern ("eight-bit-graphic");
1794   staticpro (&Qeight_bit_graphic);
1795
1796   /* Define special charsets ascii, eight-bit-control, and
1797      eight-bit-graphic.  */
1798   update_charset_table (make_number (CHARSET_ASCII),
1799                         make_number (1), make_number (94),
1800                         make_number (1),
1801                         make_number (0),
1802                         make_number ('B'),
1803                         make_number (0),
1804                         build_string ("ASCII"),
1805                         Qnil,   /* same as above */
1806                         build_string ("ASCII (ISO646 IRV)"));
1807   CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1808   Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1809
1810   update_charset_table (make_number (CHARSET_8_BIT_CONTROL),
1811                         make_number (1), make_number (96),
1812                         make_number (4),
1813                         make_number (0),
1814                         make_number (-1),
1815                         make_number (-1),
1816                         build_string ("8-bit control code (0x80..0x9F)"),
1817                         Qnil,   /* same as above */
1818                         Qnil);  /* same as above */
1819   CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control;
1820   Fput (Qeight_bit_control, Qcharset,
1821         CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL));
1822
1823   update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC),
1824                         make_number (1), make_number (96),
1825                         make_number (4),
1826                         make_number (0),
1827                         make_number (-1),
1828                         make_number (-1),
1829                         build_string ("8-bit graphic char (0xA0..0xFF)"),
1830                         Qnil,   /* same as above */
1831                         Qnil);  /* same as above */
1832   CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1833   Fput (Qeight_bit_graphic, Qcharset,
1834         CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1835
1836   Qauto_fill_chars = intern ("auto-fill-chars");
1837   staticpro (&Qauto_fill_chars);
1838   Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1839
1840   defsubr (&Sdefine_charset);
1841   defsubr (&Sgeneric_character_list);
1842   defsubr (&Sget_unused_iso_final_char);
1843   defsubr (&Sdeclare_equiv_charset);
1844   defsubr (&Sfind_charset_region);
1845   defsubr (&Sfind_charset_string);
1846   defsubr (&Smake_char_internal);
1847   defsubr (&Ssplit_char);
1848   defsubr (&Schar_charset);
1849   defsubr (&Scharset_after);
1850   defsubr (&Siso_charset);
1851   defsubr (&Schar_valid_p);
1852   defsubr (&Sunibyte_char_to_multibyte);
1853   defsubr (&Smultibyte_char_to_unibyte);
1854   defsubr (&Schar_bytes);
1855   defsubr (&Schar_width);
1856   defsubr (&Sstring_width);
1857   defsubr (&Schar_direction);
1858   defsubr (&Schars_in_region);
1859   defsubr (&Sstring);
1860   defsubr (&Ssetup_special_charsets);
1861
1862   DEFVAR_LISP ("charset-list", &Vcharset_list,
1863                doc: /* List of charsets ever defined.  */);
1864   Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control,
1865                                         Fcons (Qeight_bit_graphic, Qnil)));
1866
1867   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
1868                doc: /* Vector of cons cell of a symbol and translation table ever defined.
1869 An ID of a translation table is an index of this vector.  */);
1870   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1871
1872   DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1873               doc: /* Leading-code of private TYPE9N charset of column-width 1.  */);
1874   leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1875
1876   DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1877               doc: /* Leading-code of private TYPE9N charset of column-width 2.  */);
1878   leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1879
1880   DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1881               doc: /* Leading-code of private TYPE9Nx9N charset of column-width 1.  */);
1882   leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1883
1884   DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1885               doc: /* Leading-code of private TYPE9Nx9N charset of column-width 2.  */);
1886   leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1887
1888   DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1889               doc: /* Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.
1890 This is used for converting unibyte text to multibyte,
1891 and for inserting character codes specified by number.
1892
1893 This serves to convert a Latin-1 or similar 8-bit character code
1894 to the corresponding Emacs multibyte character code.
1895 Typically the value should be (- (make-char CHARSET 0) 128),
1896 for your choice of character set.
1897 If `nonascii-translation-table' is non-nil, it overrides this variable.  */);
1898   nonascii_insert_offset = 0;
1899
1900   DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1901                doc: /* Translation table to convert non-ASCII unibyte codes to multibyte.
1902 This is used for converting unibyte text to multibyte,
1903 and for inserting character codes specified by number.
1904
1905 Conversion is performed only when multibyte characters are enabled,
1906 and it serves to convert a Latin-1 or similar 8-bit character code
1907 to the corresponding Emacs character code.
1908
1909 If this is nil, `nonascii-insert-offset' is used instead.
1910 See also the docstring of `make-translation-table'.  */);
1911   Vnonascii_translation_table = Qnil;
1912
1913   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1914                doc: /* A char-table for characters which invoke auto-filling.
1915 Such characters have value t in this table.  */);
1916   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1917   CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1918   CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1919 }
1920
1921 #endif /* emacs */
1922
1923 /* arch-tag: 66a89b8d-4c28-47d3-9ca1-56f78440d69f
1924    (do not change this comment) */