src/character.h

   1 /* Header for multibyte character handler.
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H13PRO009
   7
   8 This file is part of GNU Emacs.
   9
  10 GNU Emacs is free software: you can redistribute it and/or modify
  11 it under the terms of the GNU General Public License as published by
  12 the Free Software Foundation, either version 3 of the License, or
  13 (at your option) any later version.
  14
  15 GNU Emacs is distributed in the hope that it will be useful,
  16 but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 GNU General Public License for more details.
  19
  20 You should have received a copy of the GNU General Public License
  21 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  22
  23 #ifndef EMACS_CHARACTER_H
  24 #define EMACS_CHARACTER_H
  25
  26 #include <verify.h>
  27
  28 INLINE_HEADER_BEGIN
  29
  30 /* character code       1st byte   byte sequence
  31    --------------       --------   -------------
  32         0-7F            00..7F     0xxxxxxx
  33        80-7FF           C2..DF     110xxxxx 10xxxxxx
  34       800-FFFF          E0..EF     1110xxxx 10xxxxxx 10xxxxxx
  35     10000-1FFFFF        F0..F7     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  36    200000-3FFF7F        F8         11111000 1000xxxx 10xxxxxx 10xxxxxx 10xxxxxx
  37    3FFF80-3FFFFF        C0..C1     1100000x 10xxxxxx (for eight-bit-char)
  38    400000-...           invalid
  39
  40    invalid 1st byte     80..BF     10xxxxxx
  41                         F9..FF     11111xxx (xxx != 000)
  42 */
  43
  44 /* Maximum character code ((1 << CHARACTERBITS) - 1).  */
  45 #define MAX_CHAR  0x3FFFFF
  46
  47 /* Maximum Unicode character code.  */
  48 #define MAX_UNICODE_CHAR 0x10FFFF
  49
  50 /* Maximum N-byte character codes.  */
  51 #define MAX_1_BYTE_CHAR 0x7F
  52 #define MAX_2_BYTE_CHAR 0x7FF
  53 #define MAX_3_BYTE_CHAR 0xFFFF
  54 #define MAX_4_BYTE_CHAR 0x1FFFFF
  55 #define MAX_5_BYTE_CHAR 0x3FFF7F
  56
  57 /* Minimum leading code of multibyte characters.  */
  58 #define MIN_MULTIBYTE_LEADING_CODE 0xC0
  59 /* Maximum leading code of multibyte characters.  */
  60 #define MAX_MULTIBYTE_LEADING_CODE 0xF8
  61
  62 /* Unicode character values.  */
  63 enum
  64 {
  65   NO_BREAK_SPACE = 0x00A0,
  66   SOFT_HYPHEN = 0x00AD,
  67   ZERO_WIDTH_NON_JOINER = 0x200C,
  68   ZERO_WIDTH_JOINER = 0x200D,
  69   HYPHEN = 0x2010,
  70   NON_BREAKING_HYPHEN = 0x2011,
  71   LEFT_SINGLE_QUOTATION_MARK = 0x2018,
  72   RIGHT_SINGLE_QUOTATION_MARK = 0x2019,
  73   PARAGRAPH_SEPARATOR = 0x2029,
  74   LEFT_POINTING_ANGLE_BRACKET = 0x2329,
  75   RIGHT_POINTING_ANGLE_BRACKET = 0x232A,
  76   LEFT_ANGLE_BRACKET = 0x3008,
  77   RIGHT_ANGLE_BRACKET = 0x3009,
  78   OBJECT_REPLACEMENT_CHARACTER = 0xFFFC,
  79 };
  80
  81 /* UTF-8 encodings.  Use \x escapes, so they are portable to pre-C11
  82    compilers and can be concatenated with ordinary string literals.  */
  83 #define uLSQM "\xE2\x80\x98" /* U+2018 LEFT SINGLE QUOTATION MARK */
  84 #define uRSQM "\xE2\x80\x99" /* U+2019 RIGHT SINGLE QUOTATION MARK */
  85
  86 /* Nonzero iff C is a character that corresponds to a raw 8-bit
  87    byte.  */
  88 #define CHAR_BYTE8_P(c) ((c) > MAX_5_BYTE_CHAR)
  89
  90 /* Return the character code for raw 8-bit byte BYTE.  */
  91 #define BYTE8_TO_CHAR(byte) ((byte) + 0x3FFF00)
  92
  93 #define UNIBYTE_TO_CHAR(byte) \
  94   (ASCII_CHAR_P (byte) ? (byte) : BYTE8_TO_CHAR (byte))
  95
  96 /* Return the raw 8-bit byte for character C.  */
  97 #define CHAR_TO_BYTE8(c) (CHAR_BYTE8_P (c) ? (c) - 0x3FFF00 : (c & 0xFF))
  98
  99 /* Return the raw 8-bit byte for character C,
 100    or -1 if C doesn't correspond to a byte.  */
 101 #define CHAR_TO_BYTE_SAFE(c)                                            \
 102   (ASCII_CHAR_P (c) ? c : (CHAR_BYTE8_P (c) ? (c) - 0x3FFF00 : -1))
 103
 104 /* Nonzero iff BYTE is the 1st byte of a multibyte form of a character
 105    that corresponds to a raw 8-bit byte.  */
 106 #define CHAR_BYTE8_HEAD_P(byte) ((byte) == 0xC0 || (byte) == 0xC1)
 107
 108 /* If C is not ASCII, make it unibyte. */
 109 #define MAKE_CHAR_UNIBYTE(c)    \
 110   do {                          \
 111     if (! ASCII_CHAR_P (c))     \
 112       c = CHAR_TO_BYTE8 (c);    \
 113   } while (false)
 114
 115
 116 /* If C is not ASCII, make it multibyte.  Assumes C < 256.  */
 117 #define MAKE_CHAR_MULTIBYTE(c) \
 118   (eassert ((c) >= 0 && (c) < 256), (c) = UNIBYTE_TO_CHAR (c))
 119
 120 /* This is the maximum byte length of multibyte form.  */
 121 #define MAX_MULTIBYTE_LENGTH 5
 122
 123 /* Nonzero iff X is a character.  */
 124 #define CHARACTERP(x) (NATNUMP (x) && XFASTINT (x) <= MAX_CHAR)
 125
 126 /* Nonzero iff C is valid as a character code.  */
 127 #define CHAR_VALID_P(c) UNSIGNED_CMP (c, <=, MAX_CHAR)
 128
 129 /* Check if Lisp object X is a character or not.  */
 130 #define CHECK_CHARACTER(x) \
 131   CHECK_TYPE (CHARACTERP (x), Qcharacterp, x)
 132
 133 #define CHECK_CHARACTER_CAR(x) \
 134   do {                                  \
 135     Lisp_Object tmp = XCAR (x);         \
 136     CHECK_CHARACTER (tmp);              \
 137     XSETCAR ((x), tmp);                 \
 138   } while (false)
 139
 140 #define CHECK_CHARACTER_CDR(x) \
 141   do {                                  \
 142     Lisp_Object tmp = XCDR (x);         \
 143     CHECK_CHARACTER (tmp);              \
 144     XSETCDR ((x), tmp);                 \
 145   } while (false)
 146
 147 /* Nonzero iff C is a character of code less than 0x100.  */
 148 #define SINGLE_BYTE_CHAR_P(c) UNSIGNED_CMP (c, <, 0x100)
 149
 150 /* Nonzero if character C has a printable glyph.  */
 151 #define CHAR_PRINTABLE_P(c)     \
 152   (((c) >= 32 && (c) < 127)     \
 153    || ! NILP (CHAR_TABLE_REF (Vprintable_chars, (c))))
 154
 155 /* Return byte length of multibyte form for character C.  */
 156 #define CHAR_BYTES(c)                   \
 157   ( (c) <= MAX_1_BYTE_CHAR ? 1          \
 158     : (c) <= MAX_2_BYTE_CHAR ? 2        \
 159     : (c) <= MAX_3_BYTE_CHAR ? 3        \
 160     : (c) <= MAX_4_BYTE_CHAR ? 4        \
 161     : (c) <= MAX_5_BYTE_CHAR ? 5        \
 162     : 2)
 163
 164
 165 /* Return the leading code of multibyte form of C.  */
 166 #define CHAR_LEADING_CODE(c)                            \
 167   ((c) <= MAX_1_BYTE_CHAR ? c                           \
 168    : (c) <= MAX_2_BYTE_CHAR ? (0xC0 | ((c) >> 6))       \
 169    : (c) <= MAX_3_BYTE_CHAR ? (0xE0 | ((c) >> 12))      \
 170    : (c) <= MAX_4_BYTE_CHAR ? (0xF0 | ((c) >> 18))      \
 171    : (c) <= MAX_5_BYTE_CHAR ? 0xF8                      \
 172    : (0xC0 | (((c) >> 6) & 0x01)))
 173
 174
 175 /* Store multibyte form of the character C in P.  The caller should
 176    allocate at least MAX_MULTIBYTE_LENGTH bytes area at P in advance.
 177    Returns the length of the multibyte form.  */
 178
 179 #define CHAR_STRING(c, p)                       \
 180   (UNSIGNED_CMP (c, <=, MAX_1_BYTE_CHAR)        \
 181    ? ((p)[0] = (c),                             \
 182       1)                                        \
 183    : UNSIGNED_CMP (c, <=, MAX_2_BYTE_CHAR)      \
 184    ? ((p)[0] = (0xC0 | ((c) >> 6)),             \
 185       (p)[1] = (0x80 | ((c) & 0x3F)),           \
 186       2)                                        \
 187    : UNSIGNED_CMP (c, <=, MAX_3_BYTE_CHAR)      \
 188    ? ((p)[0] = (0xE0 | ((c) >> 12)),            \
 189       (p)[1] = (0x80 | (((c) >> 6) & 0x3F)),    \
 190       (p)[2] = (0x80 | ((c) & 0x3F)),           \
 191       3)                                        \
 192    : verify_expr (sizeof (c) <= sizeof (unsigned), char_string (c, p)))
 193
 194 /* Store multibyte form of byte B in P.  The caller should allocate at
 195    least MAX_MULTIBYTE_LENGTH bytes area at P in advance.  Returns the
 196    length of the multibyte form.  */
 197
 198 #define BYTE8_STRING(b, p)                      \
 199   ((p)[0] = (0xC0 | (((b) >> 6) & 0x01)),       \
 200    (p)[1] = (0x80 | ((b) & 0x3F)),              \
 201    2)
 202
 203
 204 /* Store multibyte form of the character C in P and advance P to the
 205    end of the multibyte form.  The caller should allocate at least
 206    MAX_MULTIBYTE_LENGTH bytes area at P in advance.  */
 207
 208 #define CHAR_STRING_ADVANCE(c, p)               \
 209   do {                                          \
 210     if ((c) <= MAX_1_BYTE_CHAR)                 \
 211       *(p)++ = (c);                             \
 212     else if ((c) <= MAX_2_BYTE_CHAR)            \
 213       *(p)++ = (0xC0 | ((c) >> 6)),             \
 214         *(p)++ = (0x80 | ((c) & 0x3F));         \
 215     else if ((c) <= MAX_3_BYTE_CHAR)            \
 216       *(p)++ = (0xE0 | ((c) >> 12)),            \
 217         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
 218         *(p)++ = (0x80 | ((c) & 0x3F));         \
 219     else                                        \
 220       {                                         \
 221         verify (sizeof (c) <= sizeof (unsigned));       \
 222         (p) += char_string (c, p);              \
 223       }                                         \
 224   } while (false)
 225
 226
 227 /* Nonzero iff BYTE starts a non-ASCII character in a multibyte
 228    form.  */
 229 #define LEADING_CODE_P(byte) (((byte) & 0xC0) == 0xC0)
 230
 231 /* Nonzero iff BYTE is a trailing code of a non-ASCII character in a
 232    multibyte form.  */
 233 #define TRAILING_CODE_P(byte) (((byte) & 0xC0) == 0x80)
 234
 235 /* Nonzero iff BYTE starts a character in a multibyte form.
 236    This is equivalent to:
 237         (ASCII_CHAR_P (byte) || LEADING_CODE_P (byte))  */
 238 #define CHAR_HEAD_P(byte) (((byte) & 0xC0) != 0x80)
 239
 240 /* How many bytes a character that starts with BYTE occupies in a
 241    multibyte form.  */
 242 #define BYTES_BY_CHAR_HEAD(byte)        \
 243   (!((byte) & 0x80) ? 1                 \
 244    : !((byte) & 0x20) ? 2               \
 245    : !((byte) & 0x10) ? 3               \
 246    : !((byte) & 0x08) ? 4               \
 247    : 5)
 248
 249
 250 /* The byte length of multibyte form at unibyte string P ending at
 251    PEND.  If STR doesn't point to a valid multibyte form, return 0.  */
 252
 253 #define MULTIBYTE_LENGTH(p, pend)                               \
 254   (p >= pend ? 0                                                \
 255    : !((p)[0] & 0x80) ? 1                                       \
 256    : ((p + 1 >= pend) || (((p)[1] & 0xC0) != 0x80)) ? 0         \
 257    : ((p)[0] & 0xE0) == 0xC0 ? 2                                \
 258    : ((p + 2 >= pend) || (((p)[2] & 0xC0) != 0x80)) ? 0         \
 259    : ((p)[0] & 0xF0) == 0xE0 ? 3                                \
 260    : ((p + 3 >= pend) || (((p)[3] & 0xC0) != 0x80)) ? 0         \
 261    : ((p)[0] & 0xF8) == 0xF0 ? 4                                \
 262    : ((p + 4 >= pend) || (((p)[4] & 0xC0) != 0x80)) ? 0         \
 263    : (p)[0] == 0xF8 && ((p)[1] & 0xF0) == 0x80 ? 5              \
 264    : 0)
 265
 266
 267 /* Like MULTIBYTE_LENGTH, but don't check the ending address.  */
 268
 269 #define MULTIBYTE_LENGTH_NO_CHECK(p)                    \
 270   (!((p)[0] & 0x80) ? 1                                 \
 271    : ((p)[1] & 0xC0) != 0x80 ? 0                        \
 272    : ((p)[0] & 0xE0) == 0xC0 ? 2                        \
 273    : ((p)[2] & 0xC0) != 0x80 ? 0                        \
 274    : ((p)[0] & 0xF0) == 0xE0 ? 3                        \
 275    : ((p)[3] & 0xC0) != 0x80 ? 0                        \
 276    : ((p)[0] & 0xF8) == 0xF0 ? 4                        \
 277    : ((p)[4] & 0xC0) != 0x80 ? 0                        \
 278    : (p)[0] == 0xF8 && ((p)[1] & 0xF0) == 0x80 ? 5      \
 279    : 0)
 280
 281 /* If P is before LIMIT, advance P to the next character boundary.
 282    Assumes that P is already at a character boundary of the same
 283    multibyte form whose end address is LIMIT.  */
 284
 285 #define NEXT_CHAR_BOUNDARY(p, limit)    \
 286   do {                                  \
 287     if ((p) < (limit))                  \
 288       (p) += BYTES_BY_CHAR_HEAD (*(p)); \
 289   } while (false)
 290
 291
 292 /* If P is after LIMIT, advance P to the previous character boundary.
 293    Assumes that P is already at a character boundary of the same
 294    multibyte form whose beginning address is LIMIT.  */
 295
 296 #define PREV_CHAR_BOUNDARY(p, limit)                                    \
 297   do {                                                                  \
 298     if ((p) > (limit))                                                  \
 299       {                                                                 \
 300         const unsigned char *chp = (p);                                 \
 301         do {                                                            \
 302           chp--;                                                        \
 303         } while (chp >= limit && ! CHAR_HEAD_P (*chp));                 \
 304         (p) = (BYTES_BY_CHAR_HEAD (*chp) == (p) - chp) ? chp : (p) - 1; \
 305       }                                                                 \
 306   } while (false)
 307
 308 /* Return the character code of character whose multibyte form is at
 309    P.  Note that this macro unifies CJK characters whose codepoints
 310    are in the Private Use Areas (PUAs), so it might return a different
 311    codepoint from the one actually stored at P.  */
 312
 313 #define STRING_CHAR(p)                                          \
 314   (!((p)[0] & 0x80)                                             \
 315    ? (p)[0]                                                     \
 316    : ! ((p)[0] & 0x20)                                          \
 317    ? (((((p)[0] & 0x1F) << 6)                                   \
 318        | ((p)[1] & 0x3F))                                       \
 319       + (((unsigned char) (p)[0]) < 0xC2 ? 0x3FFF80 : 0))       \
 320    : ! ((p)[0] & 0x10)                                          \
 321    ? ((((p)[0] & 0x0F) << 12)                                   \
 322       | (((p)[1] & 0x3F) << 6)                                  \
 323       | ((p)[2] & 0x3F))                                        \
 324    : string_char ((p), NULL, NULL))
 325
 326
 327 /* Like STRING_CHAR, but set ACTUAL_LEN to the length of multibyte
 328    form.
 329
 330    Note: This macro returns the actual length of the character's
 331    multibyte sequence as it is stored in a buffer or string.  The
 332    character it returns might have a different codepoint that has a
 333    different multibyte sequence of a different length, due to possible
 334    unification of CJK characters inside string_char.  Therefore do NOT
 335    assume that the length returned by this macro is identical to the
 336    length of the multibyte sequence of the character it returns.  */
 337
 338 #define STRING_CHAR_AND_LENGTH(p, actual_len)                   \
 339   (!((p)[0] & 0x80)                                             \
 340    ? ((actual_len) = 1, (p)[0])                                 \
 341    : ! ((p)[0] & 0x20)                                          \
 342    ? ((actual_len) = 2,                                         \
 343       (((((p)[0] & 0x1F) << 6)                                  \
 344         | ((p)[1] & 0x3F))                                      \
 345        + (((unsigned char) (p)[0]) < 0xC2 ? 0x3FFF80 : 0)))     \
 346    : ! ((p)[0] & 0x10)                                          \
 347    ? ((actual_len) = 3,                                         \
 348       ((((p)[0] & 0x0F) << 12)                                  \
 349        | (((p)[1] & 0x3F) << 6)                                 \
 350        | ((p)[2] & 0x3F)))                                      \
 351    : string_char ((p), NULL, &actual_len))
 352
 353
 354 /* Like STRING_CHAR, but advance P to the end of multibyte form.  */
 355
 356 #define STRING_CHAR_ADVANCE(p)                                  \
 357   (!((p)[0] & 0x80)                                             \
 358    ? *(p)++                                                     \
 359    : ! ((p)[0] & 0x20)                                          \
 360    ? ((p) += 2,                                                 \
 361       ((((p)[-2] & 0x1F) << 6)                                  \
 362        | ((p)[-1] & 0x3F)                                       \
 363        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
 364    : ! ((p)[0] & 0x10)                                          \
 365    ? ((p) += 3,                                                 \
 366       ((((p)[-3] & 0x0F) << 12)                                 \
 367        | (((p)[-2] & 0x3F) << 6)                                \
 368        | ((p)[-1] & 0x3F)))                                     \
 369    : string_char ((p), &(p), NULL))
 370
 371
 372 /* Fetch the "next" character from Lisp string STRING at byte position
 373    BYTEIDX, character position CHARIDX.  Store it into OUTPUT.
 374
 375    All the args must be side-effect-free.
 376    BYTEIDX and CHARIDX must be lvalues;
 377    we increment them past the character fetched.  */
 378
 379 #define FETCH_STRING_CHAR_ADVANCE(OUTPUT, STRING, CHARIDX, BYTEIDX)     \
 380   do                                                                    \
 381     {                                                                   \
 382       CHARIDX++;                                                        \
 383       if (STRING_MULTIBYTE (STRING))                                    \
 384         {                                                               \
 385           unsigned char *chp = &SDATA (STRING)[BYTEIDX];                \
 386           int chlen;                                                    \
 387                                                                         \
 388           OUTPUT = STRING_CHAR_AND_LENGTH (chp, chlen);                 \
 389           BYTEIDX += chlen;                                             \
 390         }                                                               \
 391       else                                                              \
 392         {                                                               \
 393           OUTPUT = SREF (STRING, BYTEIDX);                              \
 394           BYTEIDX++;                                                    \
 395         }                                                               \
 396     }                                                                   \
 397   while (false)
 398
 399 /* Like FETCH_STRING_CHAR_ADVANCE, but return a multibyte character
 400    even if STRING is unibyte.  */
 401
 402 #define FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE(OUTPUT, STRING, CHARIDX, BYTEIDX) \
 403   do                                                                          \
 404     {                                                                         \
 405       CHARIDX++;                                                              \
 406       if (STRING_MULTIBYTE (STRING))                                          \
 407         {                                                                     \
 408           unsigned char *chp = &SDATA (STRING)[BYTEIDX];                      \
 409           int chlen;                                                          \
 410                                                                               \
 411           OUTPUT = STRING_CHAR_AND_LENGTH (chp, chlen);                       \
 412           BYTEIDX += chlen;                                                   \
 413         }                                                                     \
 414       else                                                                    \
 415         {                                                                     \
 416           OUTPUT = SREF (STRING, BYTEIDX);                                    \
 417           BYTEIDX++;                                                          \
 418           MAKE_CHAR_MULTIBYTE (OUTPUT);                                       \
 419         }                                                                     \
 420     }                                                                         \
 421   while (false)
 422
 423
 424 /* Like FETCH_STRING_CHAR_ADVANCE, but assumes STRING is multibyte.  */
 425
 426 #define FETCH_STRING_CHAR_ADVANCE_NO_CHECK(OUTPUT, STRING, CHARIDX, BYTEIDX) \
 427   do                                                                         \
 428     {                                                                        \
 429       unsigned char *fetch_ptr = &SDATA (STRING)[BYTEIDX];                   \
 430       int fetch_len;                                                         \
 431                                                                              \
 432       OUTPUT = STRING_CHAR_AND_LENGTH (fetch_ptr, fetch_len);                \
 433       BYTEIDX += fetch_len;                                                  \
 434       CHARIDX++;                                                             \
 435     }                                                                        \
 436   while (false)
 437
 438
 439 /* Like FETCH_STRING_CHAR_ADVANCE, but fetch character from the current
 440    buffer.  */
 441
 442 #define FETCH_CHAR_ADVANCE(OUTPUT, CHARIDX, BYTEIDX)            \
 443   do                                                            \
 444     {                                                           \
 445       CHARIDX++;                                                \
 446       if (!NILP (BVAR (current_buffer, enable_multibyte_characters)))   \
 447         {                                                       \
 448           unsigned char *chp = BYTE_POS_ADDR (BYTEIDX);         \
 449           int chlen;                                            \
 450                                                                 \
 451           OUTPUT = STRING_CHAR_AND_LENGTH (chp, chlen);         \
 452           BYTEIDX += chlen;                                     \
 453         }                                                       \
 454       else                                                      \
 455         {                                                       \
 456           OUTPUT = *(BYTE_POS_ADDR (BYTEIDX));                  \
 457           BYTEIDX++;                                            \
 458         }                                                       \
 459     }                                                           \
 460   while (false)
 461
 462
 463 /* Like FETCH_CHAR_ADVANCE, but assumes the current buffer is multibyte.  */
 464
 465 #define FETCH_CHAR_ADVANCE_NO_CHECK(OUTPUT, CHARIDX, BYTEIDX)   \
 466   do                                                            \
 467     {                                                           \
 468       unsigned char *chp = BYTE_POS_ADDR (BYTEIDX);             \
 469       int chlen;                                                        \
 470                                                                 \
 471       OUTPUT = STRING_CHAR_AND_LENGTH (chp, chlen);             \
 472       BYTEIDX += chlen;                                         \
 473       CHARIDX++;                                                \
 474     }                                                           \
 475   while (false)
 476
 477
 478 /* Increment the buffer byte position POS_BYTE of the current buffer to
 479    the next character boundary.  No range checking of POS.  */
 480
 481 #define INC_POS(pos_byte)                               \
 482   do {                                                  \
 483     unsigned char *chp = BYTE_POS_ADDR (pos_byte);      \
 484     pos_byte += BYTES_BY_CHAR_HEAD (*chp);              \
 485   } while (false)
 486
 487
 488 /* Decrement the buffer byte position POS_BYTE of the current buffer to
 489    the previous character boundary.  No range checking of POS.  */
 490
 491 #define DEC_POS(pos_byte)                       \
 492   do {                                          \
 493     unsigned char *chp;                         \
 494                                                 \
 495     pos_byte--;                                 \
 496     if (pos_byte < GPT_BYTE)                    \
 497       chp = BEG_ADDR + pos_byte - BEG_BYTE;     \
 498     else                                        \
 499       chp = BEG_ADDR + GAP_SIZE + pos_byte - BEG_BYTE; \
 500     while (!CHAR_HEAD_P (*chp))                 \
 501       {                                         \
 502         chp--;                                  \
 503         pos_byte--;                             \
 504       }                                         \
 505   } while (false)
 506
 507 /* Increment both CHARPOS and BYTEPOS, each in the appropriate way.  */
 508
 509 #define INC_BOTH(charpos, bytepos)                              \
 510   do                                                            \
 511     {                                                           \
 512       (charpos)++;                                              \
 513       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))    \
 514         (bytepos)++;                                            \
 515       else                                                      \
 516         INC_POS ((bytepos));                                    \
 517     }                                                           \
 518   while (false)
 519
 520
 521 /* Decrement both CHARPOS and BYTEPOS, each in the appropriate way.  */
 522
 523 #define DEC_BOTH(charpos, bytepos)                              \
 524   do                                                            \
 525     {                                                           \
 526       (charpos)--;                                              \
 527       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))    \
 528         (bytepos)--;                                            \
 529       else                                                      \
 530         DEC_POS ((bytepos));                                    \
 531     }                                                           \
 532   while (false)
 533
 534
 535 /* Increment the buffer byte position POS_BYTE of the current buffer to
 536    the next character boundary.  This macro relies on the fact that
 537    *GPT_ADDR and *Z_ADDR are always accessible and the values are
 538    '\0'.  No range checking of POS_BYTE.  */
 539
 540 #define BUF_INC_POS(buf, pos_byte)                              \
 541   do {                                                          \
 542     unsigned char *chp = BUF_BYTE_ADDRESS (buf, pos_byte);      \
 543     pos_byte += BYTES_BY_CHAR_HEAD (*chp);                      \
 544   } while (false)
 545
 546
 547 /* Decrement the buffer byte position POS_BYTE of the current buffer to
 548    the previous character boundary.  No range checking of POS_BYTE.  */
 549
 550 #define BUF_DEC_POS(buf, pos_byte)                                      \
 551   do {                                                                  \
 552     unsigned char *chp;                                                 \
 553     pos_byte--;                                                         \
 554     if (pos_byte < BUF_GPT_BYTE (buf))                                  \
 555       chp = BUF_BEG_ADDR (buf) + pos_byte - BEG_BYTE;                   \
 556     else                                                                \
 557       chp = BUF_BEG_ADDR (buf) + BUF_GAP_SIZE (buf) + pos_byte - BEG_BYTE;\
 558     while (!CHAR_HEAD_P (*chp))                                         \
 559       {                                                                 \
 560         chp--;                                                          \
 561         pos_byte--;                                                     \
 562       }                                                                 \
 563   } while (false)
 564
 565
 566 /* Return a non-outlandish value for the tab width.  */
 567
 568 #define SANE_TAB_WIDTH(buf) \
 569   sanitize_tab_width (XFASTINT (BVAR (buf, tab_width)))
 570 INLINE int
 571 sanitize_tab_width (EMACS_INT width)
 572 {
 573   return 0 < width && width <= 1000 ? width : 8;
 574 }
 575
 576 /* Return the width of ASCII character C.  The width is measured by
 577    how many columns C will occupy on the screen when displayed in the
 578    current buffer.  */
 579
 580 #define ASCII_CHAR_WIDTH(c)                                             \
 581   (c < 0x20                                                             \
 582    ? (c == '\t'                                                         \
 583       ? SANE_TAB_WIDTH (current_buffer)                                 \
 584       : (c == '\n' ? 0 : (NILP (BVAR (current_buffer, ctl_arrow)) ? 4 : 2)))    \
 585    : (c < 0x7f                                                          \
 586       ? 1                                                               \
 587       : ((NILP (BVAR (current_buffer, ctl_arrow)) ? 4 : 2))))
 588
 589 /* Return a non-outlandish value for a character width.  */
 590
 591 INLINE int
 592 sanitize_char_width (EMACS_INT width)
 593 {
 594   return 0 <= width && width <= 1000 ? width : 1000;
 595 }
 596
 597 /* Return the width of character C.  The width is measured by how many
 598    columns C will occupy on the screen when displayed in the current
 599    buffer.  */
 600
 601 #define CHAR_WIDTH(c)           \
 602   (ASCII_CHAR_P (c)             \
 603    ? ASCII_CHAR_WIDTH (c)       \
 604    : sanitize_char_width (XINT (CHAR_TABLE_REF (Vchar_width_table, c))))
 605
 606 /* If C is a variation selector, return the index of the
 607    variation selector (1..256).  Otherwise, return 0.  */
 608
 609 #define CHAR_VARIATION_SELECTOR_P(c)            \
 610   ((c) < 0xFE00 ? 0                             \
 611    : (c) <= 0xFE0F ? (c) - 0xFE00 + 1           \
 612    : (c) < 0xE0100 ? 0                          \
 613    : (c) <= 0xE01EF ? (c) - 0xE0100 + 17        \
 614    : 0)
 615
 616 /* If C is a high surrogate, return 1.  If C is a low surrogate,
 617    return 2.  Otherwise, return 0.  */
 618
 619 #define CHAR_SURROGATE_PAIR_P(c)        \
 620   ((c) < 0xD800 ? 0                     \
 621    : (c) <= 0xDBFF ? 1                  \
 622    : (c) <= 0xDFFF ? 2                  \
 623    : 0)
 624
 625 /* Data type for Unicode general category.
 626
 627    The order of members must be in sync with the 8th element of the
 628    member of unidata-prop-alist (in admin/unidata/unidata-gen.el) for
 629    Unicode character property `general-category'.  */
 630
 631 typedef enum {
 632   UNICODE_CATEGORY_UNKNOWN = 0,
 633   UNICODE_CATEGORY_Lu,
 634   UNICODE_CATEGORY_Ll,
 635   UNICODE_CATEGORY_Lt,
 636   UNICODE_CATEGORY_Lm,
 637   UNICODE_CATEGORY_Lo,
 638   UNICODE_CATEGORY_Mn,
 639   UNICODE_CATEGORY_Mc,
 640   UNICODE_CATEGORY_Me,
 641   UNICODE_CATEGORY_Nd,
 642   UNICODE_CATEGORY_Nl,
 643   UNICODE_CATEGORY_No,
 644   UNICODE_CATEGORY_Pc,
 645   UNICODE_CATEGORY_Pd,
 646   UNICODE_CATEGORY_Ps,
 647   UNICODE_CATEGORY_Pe,
 648   UNICODE_CATEGORY_Pi,
 649   UNICODE_CATEGORY_Pf,
 650   UNICODE_CATEGORY_Po,
 651   UNICODE_CATEGORY_Sm,
 652   UNICODE_CATEGORY_Sc,
 653   UNICODE_CATEGORY_Sk,
 654   UNICODE_CATEGORY_So,
 655   UNICODE_CATEGORY_Zs,
 656   UNICODE_CATEGORY_Zl,
 657   UNICODE_CATEGORY_Zp,
 658   UNICODE_CATEGORY_Cc,
 659   UNICODE_CATEGORY_Cf,
 660   UNICODE_CATEGORY_Cs,
 661   UNICODE_CATEGORY_Co,
 662   UNICODE_CATEGORY_Cn
 663 } unicode_category_t;
 664
 665 extern EMACS_INT char_resolve_modifier_mask (EMACS_INT) ATTRIBUTE_CONST;
 666 extern int char_string (unsigned, unsigned char *);
 667 extern int string_char (const unsigned char *,
 668                         const unsigned char **, int *);
 669
 670 extern int translate_char (Lisp_Object, int c);
 671 extern ptrdiff_t count_size_as_multibyte (const unsigned char *, ptrdiff_t);
 672 extern ptrdiff_t str_as_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t,
 673                                    ptrdiff_t *);
 674 extern ptrdiff_t str_to_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t);
 675 extern ptrdiff_t str_as_unibyte (unsigned char *, ptrdiff_t);
 676 extern ptrdiff_t str_to_unibyte (const unsigned char *, unsigned char *,
 677                                  ptrdiff_t);
 678 extern ptrdiff_t strwidth (const char *, ptrdiff_t);
 679 extern ptrdiff_t c_string_width (const unsigned char *, ptrdiff_t, int,
 680                                  ptrdiff_t *, ptrdiff_t *);
 681 extern ptrdiff_t lisp_string_width (Lisp_Object, ptrdiff_t,
 682                                     ptrdiff_t *, ptrdiff_t *);
 683
 684 extern Lisp_Object Vchar_unify_table;
 685 extern Lisp_Object string_escape_byte8 (Lisp_Object);
 686
 687 extern bool alphabeticp (int);
 688 extern bool decimalnump (int);
 689 extern bool graphicp (int);
 690 extern bool printablep (int);
 691
 692 /* Return a translation table of id number ID.  */
 693 #define GET_TRANSLATION_TABLE(id) \
 694   (XCDR (XVECTOR (Vtranslation_table_vector)->contents[(id)]))
 695
 696 /* Look up the element in char table OBJ at index CH, and return it as
 697    an integer.  If the element is not a character, return CH itself.  */
 698
 699 INLINE int
 700 char_table_translate (Lisp_Object obj, int ch)
 701 {
 702   /* This internal function is expected to be called with valid arguments,
 703      so there is a eassert instead of CHECK_xxx for the sake of speed.  */
 704   eassert (CHAR_VALID_P (ch));
 705   eassert (CHAR_TABLE_P (obj));
 706   obj = CHAR_TABLE_REF (obj, ch);
 707   return CHARACTERP (obj) ? XINT (obj) : ch;
 708 }
 709
 710 INLINE_HEADER_END
 711
 712 #endif /* EMACS_CHARACTER_H */