/*
 * UTF-8 utility functions
 *
 * (c) 2010 Steve Bennett <steveb@workware.net.au>
 *
 * See LICENCE for licence details.
 */
/* This one is always implemented */
/**
 * Encodes the 16-bit code point 'uc' as UTF-8 into the buffer at 'p'.
 *
 * The buffer must have room for up to 3 bytes. No NUL terminator is written.
 *
 * Returns the number of bytes stored (1, 2 or 3).
 */
int utf8_fromunicode(char *p, unsigned short uc)
{
    if (uc <= 0x7f) {
        /* Plain ASCII: a single byte */
        *p = (char)uc;
        return 1;
    }
    else if (uc <= 0x7ff) {
        /* 110xxxxx 10xxxxxx */
        *p++ = (char)(0xc0 | ((uc & 0x7c0) >> 6));
        *p = (char)(0x80 | (uc & 0x3f));
        return 2;
    }
    else {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        *p++ = (char)(0xe0 | ((uc & 0xf000) >> 12));
        *p++ = (char)(0x80 | ((uc & 0xfc0) >> 6));
        *p = (char)(0x80 | (uc & 0x3f));
        return 3;
    }
}
36 #if defined(JIM_UTF8) && !defined(JIM_BOOTSTRAP)
/**
 * Returns the total length in bytes of the UTF-8 sequence whose
 * first (lead) byte is 'c'.
 *
 * Returns 1, 2, 3 or 4 for a valid lead byte.
 * NOTE(review): the return value for an invalid lead byte (a continuation
 * byte 10xxxxxx or 11111xxx) is taken to be -1 -- confirm against callers.
 */
int utf8_charlen(int c)
{
    if ((c & 0x80) == 0) {
        /* 0xxxxxxx */
        return 1;
    }
    if ((c & 0xe0) == 0xc0) {
        /* 110xxxxx */
        return 2;
    }
    if ((c & 0xf0) == 0xe0) {
        /* 1110xxxx */
        return 3;
    }
    if ((c & 0xf8) == 0xf0) {
        /* 11110xxx */
        return 4;
    }
    /* Invalid sequence */
    return -1;
}
/**
 * Returns the number of characters (not bytes) in the UTF-8 string 'str'.
 *
 * 'bytelen' is the length of the string in bytes; if it is negative,
 * strlen(str) is used instead.
 *
 * Each call to utf8_tounicode() consumes one character. An invalid
 * sequence is consumed as a single byte, so it counts as one character.
 */
int utf8_strlen(const char *str, int bytelen)
{
    int charlen = 0;

    if (bytelen < 0) {
        bytelen = (int)strlen(str);
    }
    /* '> 0' rather than '!= 0': a multi-byte sequence that runs past an
     * explicit bytelen must not drive the count negative and loop forever.
     */
    while (bytelen > 0) {
        int c;
        int l = utf8_tounicode(str, &c);

        charlen++;
        str += l;
        bytelen -= l;
    }
    return charlen;
}
/**
 * Returns the byte offset in 'str' of the character at character
 * position 'index'.
 *
 * Walks forward one decoded character at a time using utf8_tounicode().
 * No check is made for the end of the string; the caller must ensure
 * that 'index' lies within the string. A non-positive index yields 0.
 */
int utf8_index(const char *str, int index)
{
    const char *s = str;

    while (index-- > 0) {
        int c;

        s += utf8_tounicode(s, &c);
    }
    return (int)(s - str);
}
/**
 * Compares the first character of 's1' with the first character of 's2'.
 *
 * Both are decoded with utf8_tounicode() and the resulting code points
 * are compared.
 *
 * Returns 1 if the code points are equal, 0 otherwise.
 */
int utf8_charequal(const char *s1, const char *s2)
{
    int c1, c2;

    utf8_tounicode(s1, &c1);
    utf8_tounicode(s2, &c2);

    return c1 == c2;
}
/**
 * Returns the length in bytes of the character that ends immediately
 * before 'str'.
 *
 * 'len' is the number of bytes available before 'str'; the scan never
 * looks further back than that. If no start-of-char byte is found
 * within 'len' bytes, 'len' is returned.
 *
 * NOTE(review): for len <= 0 this returns 1 without reading any byte;
 * callers are expected to pass len > 0.
 */
int utf8_prev_len(const char *str, int len)
{
    int n = 1;

    /* Look up to len chars backward for a start-of-char byte */
    while (n < len) {
        if ((str[-n] & 0x80) == 0) {
            /* Start of a 1-byte char */
            break;
        }
        if ((str[-n] & 0xc0) == 0xc0) {
            /* Start of a multi-byte char */
            break;
        }
        n++;
    }
    return n;
}
/**
 * Decodes the UTF-8 sequence at the start of 'str' and stores the
 * resulting code point in '*uc'.
 *
 * Only 1, 2 and 3 byte sequences are decoded. Anything else, including a
 * lead byte not followed by the required continuation bytes, is treated
 * as a single byte whose value is stored in '*uc'.
 *
 * Returns the number of bytes consumed (1, 2 or 3).
 */
int utf8_tounicode(const char *str, int *uc)
{
    /* Work on unsigned bytes so comparisons against 0xc0 etc. are well defined */
    const unsigned char *s = (const unsigned char *)str;

    if (s[0] < 0xc0) {
        /* ASCII, or a stray continuation byte */
        *uc = s[0];
        return 1;
    }
    if (s[0] < 0xe0) {
        if ((s[1] & 0xc0) == 0x80) {
            *uc = ((s[0] & ~0xc0) << 6) | (s[1] & ~0x80);
            return 2;
        }
    }
    else if (s[0] < 0xf0) {
        /* s[2] is only examined if s[1] is a continuation byte, so the
         * terminating NUL of a truncated sequence is never read past.
         */
        if (((s[1] & 0xc0) == 0x80) && ((s[2] & 0xc0) == 0x80)) {
            *uc = ((s[0] & ~0xe0) << 12) | ((s[1] & ~0x80) << 6) | (s[2] & ~0x80);
            return 3;
        }
    }

    /* Invalid sequence, so just return the byte */
    *uc = *s;
    return 1;
}
/* One entry in the case mapping table, keyed by code point */
struct casemap {
    unsigned short code;    /* code point */
    signed char lowerdelta; /* add for lowercase, or if -128 use the ext table */
    signed char upperdelta; /* add for uppercase, or offset into the ext table */
};
/* Extended table for codepoints where |delta| > 127 */
/* NOTE(review): the struct tag must match the one used by the generated
 * mapping tables in _unicode_mapping.c -- confirm it is 'caseextmap'.
 */
struct caseextmap {
    unsigned short lower; /* lowercase code point */
    unsigned short upper; /* uppercase code point */
};
150 /* Generated mapping tables */
151 #include "_unicode_mapping.c"
/* Number of entries in unicode_case_mapping[].
 * The expansion is fully parenthesized so that it behaves as a single
 * value when used inside a larger expression (e.g. x / NUMCASEMAP).
 */
#define NUMCASEMAP (sizeof(unicode_case_mapping) / sizeof(*unicode_case_mapping))
155 static int cmp_casemap(const void *key
, const void *cm
)
157 return *(int *)key
- (int)((const struct casemap
*)cm
)->code
;
160 static int utf8_map_case(int uc
, int upper
)
162 const struct casemap
*cm
= bsearch(&uc
, unicode_case_mapping
, NUMCASEMAP
, sizeof(*unicode_case_mapping
), cmp_casemap
);
165 if (cm
->lowerdelta
== -128) {
166 uc
= upper
? unicode_extmap
[cm
->upperdelta
].upper
: unicode_extmap
[cm
->upperdelta
].lower
;
169 uc
+= upper
? cm
->upperdelta
: cm
->lowerdelta
;
/**
 * Returns the uppercase equivalent of the code point 'uc', or 'uc'
 * itself if it has no uppercase mapping.
 *
 * NOTE(review): the ASCII fast path below is reconstructed -- confirm
 * that it matches the intended handling of code points below 0x80.
 */
int utf8_upper(int uc)
{
    if (uc >= 0 && uc < 0x80) {
        /* ASCII: only a-z change */
        if (uc >= 'a' && uc <= 'z') {
            return uc - 'a' + 'A';
        }
        return uc;
    }
    return utf8_map_case(uc, 1);
}
/**
 * Returns the lowercase equivalent of the code point 'uc', or 'uc'
 * itself if it has no lowercase mapping.
 *
 * NOTE(review): the ASCII fast path below is reconstructed -- confirm
 * that it matches the intended handling of code points below 0x80.
 */
int utf8_lower(int uc)
{
    if (uc >= 0 && uc < 0x80) {
        /* ASCII: only A-Z change */
        if (uc >= 'A' && uc <= 'Z') {
            return uc - 'A' + 'a';
        }
        return uc;
    }

    return utf8_map_case(uc, 0);
}
192 #endif /* JIM_BOOTSTRAP */