* docs/pmc.pod:
[parrot.git] / src / unicode.h
blobdcde2a5a4317366af5755a6445a28018f7a2cadb
/* unicode.h
 *  Copyright (C) 2001-2006, The Perl Foundation.
 *  SVN Info
 *     $Id$
 *  Overview:
 *     Unicode support header
 *  Data Structure and Algorithms:
 *  History:
 *  Notes:
 *  References:
 */
13 #if !defined(PARROT_UNICODE_H_GUARD)
14 #define PARROT_UNICODE_H_GUARD
/*
 * Integer types named after the code units of the three Unicode
 * encoding forms.  The C standard only guarantees minimum widths for
 * these base types: unsigned short is at least 16 bits and unsigned long
 * is at least 32 bits (it is 64 bits on LP64 platforms), so utf16_t and
 * utf32_t may be wider than their names suggest.
 */
typedef unsigned char      utf8_t;
typedef unsigned short int utf16_t;
typedef unsigned long int  utf32_t;
/*
 * UTF-16 surrogate ranges.
 *
 * High surrogates occupy 0xD800..0xDBFF and low surrogates occupy
 * 0xDC00..0xDFFF; the overall surrogate range is the union of the two,
 * so its bounds are taken from the high range's first value and the low
 * range's last value.
 *
 * UNICODE_HIGH_SURROGATE_SHIFT is the number of payload bits carried by
 * the low surrogate, and UNICODE_LOW_SURROGATE_MASK selects exactly
 * those bits (0x3FF).
 */
#define UNICODE_HIGH_SURROGATE_FIRST 0xD800U
#define UNICODE_HIGH_SURROGATE_LAST  0xDBFFU
#define UNICODE_HIGH_SURROGATE_SHIFT 10

#define UNICODE_LOW_SURROGATE_FIRST  0xDC00U
#define UNICODE_LOW_SURROGATE_LAST   0xDFFFU
#define UNICODE_LOW_SURROGATE_MASK   ((1U << UNICODE_HIGH_SURROGATE_SHIFT) - 1U)

#define UNICODE_SURROGATE_FIRST      UNICODE_HIGH_SURROGATE_FIRST
#define UNICODE_SURROGATE_LAST       UNICODE_LOW_SURROGATE_LAST
/*
 * Range predicates on a code unit / code point value c.  Each one
 * yields 1 when c lies inside the inclusive range and 0 otherwise.
 * The argument is evaluated up to twice, so it must not have side
 * effects.
 */
#define UNICODE_IS_SURROGATE(c) \
    (UNICODE_SURROGATE_FIRST <= (c) && (c) <= UNICODE_SURROGATE_LAST)

#define UNICODE_IS_HIGH_SURROGATE(c) \
    (UNICODE_HIGH_SURROGATE_FIRST <= (c) && (c) <= UNICODE_HIGH_SURROGATE_LAST)

#define UNICODE_IS_LOW_SURROGATE(c) \
    (UNICODE_LOW_SURROGATE_FIRST <= (c) && (c) <= UNICODE_LOW_SURROGATE_LAST)

/* True for values below 0x80, i.e. the U+0000..U+007F row of the UTF-8
 * table in this header, where the single encoded byte equals the code
 * point. */
#define UNICODE_IS_INVARIANT(c) (0x80u > (c))
/*
 * Surrogate-pair arithmetic for values at or above 0x10000.
 *
 * UNICODE_HIGH_SURROGATE(c) and UNICODE_LOW_SURROGATE(c) subtract
 * 0x10000 from c and return, respectively, the bits above
 * UNICODE_HIGH_SURROGATE_SHIFT offset into the high-surrogate range and
 * the bits selected by UNICODE_LOW_SURROGATE_MASK offset into the
 * low-surrogate range.  No range checking is done on c.
 *
 * UNICODE_DECODE_SURROGATE(hi, lo) is the inverse: it recombines a
 * high/low pair into the original value.  No validation of the pair is
 * performed.
 */
#define UNICODE_HIGH_SURROGATE(c) \
    (UNICODE_HIGH_SURROGATE_FIRST + \
     (((c) - 0x10000u) >> UNICODE_HIGH_SURROGATE_SHIFT))

#define UNICODE_LOW_SURROGATE(c) \
    (UNICODE_LOW_SURROGATE_FIRST + \
     (((c) - 0x10000u) & UNICODE_LOW_SURROGATE_MASK))

#define UNICODE_DECODE_SURROGATE(hi, lo) \
    ((((hi) - UNICODE_HIGH_SURROGATE_FIRST) << UNICODE_HIGH_SURROGATE_SHIFT) \
     + ((lo) - UNICODE_LOW_SURROGATE_FIRST) \
     + 0x10000u)
/*
 * UNISKIP(cp): 1, 2, 3 or 4 depending on whether cp is below 0x80,
 * below 0x800, below 0x10000, or anything else.  These thresholds match
 * the byte counts in the UTF-8 table in this header.  There is no upper
 * bound check: every value of 0x10000 or more yields 4.  The argument
 * may be evaluated up to three times.
 */
#define UNISKIP(cp) \
    (((cp) < 0x80)    ? 1 : \
    (((cp) < 0x800)   ? 2 : \
    (((cp) < 0x10000) ? 3 : 4)))
/*
 * UTF16SKIP(p): 2 when the unit p points at satisfies
 * UNICODE_IS_HIGH_SURROGATE, otherwise 1.  Only the first unit is
 * examined; the following unit is not checked.  p may be evaluated more
 * than once.
 */
#define UTF16SKIP(p) (UNICODE_IS_HIGH_SURROGATE((p)[0]) ? 2 : 1)
/*
   The following table is from Unicode 3.1.

   Code Points           1st Byte    2nd Byte    3rd Byte    4th Byte

      U+0000..U+007F      00..7F
      U+0080..U+07FF      C2..DF      80..BF
      U+0800..U+0FFF      E0          A0..BF      80..BF
      U+1000..U+FFFF      E1..EF      80..BF      80..BF
     U+10000..U+3FFFF     F0          90..BF      80..BF      80..BF
     U+40000..U+FFFFF     F1..F3      80..BF      80..BF      80..BF
    U+100000..U+10FFFF    F4          80..8F      80..BF      80..BF
*/
/*
 * Classification of a single UTF-8 byte b.  The argument may be
 * evaluated twice.
 *
 * UTF8_IS_START:        b is in 0xC0..0xFD.  NOTE(review): this is wider
 *                       than the lead bytes listed in the table in this
 *                       header (C2..F4); it also admits 0xC0, 0xC1 and
 *                       0xF5..0xFD.
 * UTF8_IS_CONTINUATION: b is in 0x80..0xBF.
 * UTF8_IS_CONTINUED:    b has its top bit (0x80) set; the result is the
 *                       masked value (0 or 0x80), not a normalised 0/1.
 */
#define UTF8_IS_START(b)        (0xC0u <= (b) && (b) <= 0xFDu)
#define UTF8_IS_CONTINUATION(b) (0x80u <= (b) && (b) <= 0xBFu)
#define UTF8_IS_CONTINUED(b)    (0x80u & (b))
/*
 * Lead-byte construction helpers for a UTF-8 sequence of `len` bytes.
 *
 * UTF8_START_MARK(len): the fixed high bits of the lead byte
 *     len 1 -> 0x00, 2 -> 0xC0, 3 -> 0xE0, 4 -> 0xF0.
 *     The shifted pattern is masked to 8 bits so the macro yields the
 *     byte value itself rather than relying on the caller to truncate.
 * UTF8_START_MASK(len): the payload bits available in the lead byte
 *     len 1 -> 0x7F, 2 -> 0x1F, 3 -> 0x0F, 4 -> 0x07.
 *
 * `len` is fully parenthesised so that expression arguments such as
 * `n + 1` are handled correctly.  It is evaluated twice, so it must not
 * have side effects.
 */
#define UTF8_START_MARK(len) \
    (((len) == 1) ? 0u : ((0x7Eu << (7 - (len))) & 0xFFu))
#define UTF8_START_MASK(len) \
    (((len) == 1) ? 0x7Fu : (0x1Fu >> ((len) - 2)))
/*
 * Continuation-byte layout: a continuation byte carries the marker bit
 * 0x80 and UTF8_ACCUMULATION_SHIFT (6) payload bits, which
 * UTF8_CONTINUATION_MASK (0x3F) selects.
 */
#define UTF8_CONTINUATION_MARK  0x80U
#define UTF8_ACCUMULATION_SHIFT 6
#define UTF8_CONTINUATION_MASK  ((1U << UTF8_ACCUMULATION_SHIFT) - 1U)

/*
 * UTF8_ACCUMULATE(acc, byte): shift the value decoded so far left by
 * one continuation byte's worth of payload and OR in the payload bits
 * of `byte`.
 */
#define UTF8_ACCUMULATE(acc, byte) \
    (((acc) << UTF8_ACCUMULATION_SHIFT) | ((byte) & UTF8_CONTINUATION_MASK))
/*
 * 256-entry table indexed by byte value; it is defined in another
 * translation unit.  (Presumably each entry is the length of the UTF-8
 * sequence introduced by that lead byte -- confirm against the
 * definition.)
 */
extern const char Parrot_utf8skip[256];

/*
 * UTF8SKIP(s): the Parrot_utf8skip entry for the byte s points at.
 *
 * The byte is converted to unsigned char before indexing.  Plain char is
 * signed on many platforms, so without the conversion a `char *`
 * pointing at a byte >= 0x80 would produce a negative index and read
 * outside the table.  For `unsigned char *` arguments the conversion is
 * a no-op.
 */
#define UTF8SKIP(s) (Parrot_utf8skip[(unsigned char)*(s)])
/*
 * Upper bounds on the encoded length of one character.
 *
 * UTF8_MAXLEN is 4, matching the longest row of the UTF-8 table in this
 * header (four bytes).
 * NOTE(review): UTF16_MAXLEN is also 4; a surrogate pair is two 16-bit
 * units, so this looks like a byte count rather than a unit count --
 * confirm against the callers.
 */
#define UTF8_MAXLEN  4
#define UTF16_MAXLEN 4
86 #endif /* PARROT_UNICODE_H_GUARD */
/*
 * Local variables:
 * c-file-style: "parrot"
 * End:
 * vim: expandtab shiftwidth=4:
 */