2 * Copyright 2004-2005 Timo Hirvonen
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
/*
 * Lowercase hexadecimal digits, indexed by nibble value (0-15).
 * The array holds exactly 16 characters and is NOT NUL-terminated.
 */
const char hex_tab[16] = {
	'0', '1', '2', '3', '4', '5', '6', '7',
	'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
};
31 * Byte Sequence Min (binary) Min (hex) Max (hex)
32 * ----------------------------------------------------------------------------------
33 * 0xxxxxxx 0000000 0x00000 0x00007f
34 * 110xxxxx 10xxxxxx 000 10000000 0x00080 0x0007ff
35 * 1110xxxx 10xxxxxx 10xxxxxx 00001000 00000000 0x00800 0x00ffff
36 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 00001 00000000 00000000 0x10000 0x10ffff (not 0x1fffff)
38 * max: 100 001111 111111 111111 (0x10ffff)
/*
 * Length of UTF-8 byte sequence.
 * Table index is the first byte of UTF-8 sequence.
 *
 * Entry values:
 *   1..4  total number of bytes in the sequence started by this byte
 *   0     continuation byte (10xxxxxx), invalid as a first byte
 *   -1    byte that is always invalid in UTF-8
 */
static const signed char len_tab[256] = {
	/* 0-127 0xxxxxxx */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	/* 128-191 10xxxxxx (invalid first byte) */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* 192-223 110xxxxx */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	/* 224-239 1110xxxx */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

	/* 240-244 11110xxx (000 - 100) */
	4, 4, 4, 4, 4,

	/* 245-247 11110xxx (101 - 111) (always invalid) */
	-1, -1, -1,

	/* 248-255 11111xxx (always invalid) */
	-1, -1, -1, -1, -1, -1, -1, -1
};
/*
 * Smallest and largest code point a sequence of a given length may encode.
 * Index is the length of the UTF-8 sequence - 1.
 * Read-only lookup data, hence const.
 */
static const int min_val[4] = { 0x000000, 0x000080, 0x000800, 0x010000 };
static const int max_val[4] = { 0x00007f, 0x0007ff, 0x00ffff, 0x10ffff };

/*
 * Get value bits from the first UTF-8 sequence byte.
 * Indexed the same way as min_val / max_val (sequence length - 1).
 */
static const unsigned int first_byte_mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 };
85 int u_is_valid(const char *str
)
87 const unsigned char *s
= (const unsigned char *)str
;
91 unsigned char ch
= s
[i
++];
92 int len
= len_tab
[ch
];
98 /* len - 1 10xxxxxx bytes */
103 u
= ch
& first_byte_mask
[len
];
107 if (len_tab
[ch
] != 0)
109 u
= (u
<< 6) | (ch
& 0x3f);
112 if (u
< min_val
[len
] || u
> max_val
[len
])
119 int u_strlen(const char *str
)
121 const unsigned char *s
= (const unsigned char *)str
;
127 if (unlikely(l
> 1)) {
128 /* next l - 1 bytes must be continuation bytes (binary 10xxxxxx) */
131 if (len_tab
[s
[c
]] != 0) {
132 /* invalid sequence */
145 * invalid chars counted as single characters */
152 int u_char_width(uchar u
)
154 if (unlikely(u
< 0x20))
160 /* Hangul Jamo init. consonants */
165 if (u
== 0x2329U
|| u
== 0x232aU
)
185 /* Hangul Syllables */
186 if (u
>= 0xac00U
&& u
<= 0xd7a3U
)
189 /* CJK Compatibility Ideographs */
190 if (u
>= 0xf900U
&& u
<= 0xfaffU
)
193 /* CJK Compatibility Forms */
194 if (u
>= 0xfe30U
&& u
<= 0xfe6fU
)
197 /* Fullwidth Forms */
198 if (u
>= 0xff00U
&& u
<= 0xff60U
)
201 /* Fullwidth Forms */
202 if (u
>= 0xffe0U
&& u
<= 0xffe6U
)
205 /* CJK extra stuff */
206 if (u
>= 0x20000U
&& u
<= 0x2fffdU
)
210 if (u
>= 0x30000U
&& u
<= 0x3fffdU
)
213 /* invalid bytes in unicode stream are rendered "<xx>" */
214 if (u
& U_INVALID_MASK
)
225 /* print control chars as <xx> */
230 int u_str_width(const char *str
)
237 u_get_char(str
, &idx
, &u
);
238 w
+= u_char_width(u
);
243 int u_str_nwidth(const char *str
, int len
)
250 u_get_char(str
, &idx
, &u
);
253 w
+= u_char_width(u
);
259 void u_prev_char_pos(const char *str
, int *idx
)
261 const unsigned char *s
= (const unsigned char *)str
;
262 int c
, len
, i
= *idx
;
268 /* start of byte sequence or invalid uchar */
275 /* first byte of the sequence is missing */
287 /* too long sequence */
291 /* incorrect length */
304 void u_get_char(const char *str
, int *idx
, uchar
*uch
)
306 const unsigned char *s
= (const unsigned char *)str
;
312 if (unlikely(len
< 1))
316 u
= ch
& first_byte_mask
[len
];
319 if (unlikely(len_tab
[ch
] != 0))
321 u
= (u
<< 6) | (ch
& 0x3f);
330 *uch
= u
| U_INVALID_MASK
;
334 void u_set_char_raw(char *str
, int *idx
, uchar uch
)
338 if (uch
<= 0x0000007fU
) {
341 } else if (uch
<= 0x000007ffU
) {
342 str
[i
+ 1] = (uch
& 63) | 0x80; uch
>>= 6;
343 str
[i
+ 0] = uch
| 0x000000c0U
;
346 } else if (uch
<= 0x0000ffffU
) {
347 str
[i
+ 2] = (uch
& 63) | 0x80; uch
>>= 6;
348 str
[i
+ 1] = (uch
& 63) | 0x80; uch
>>= 6;
349 str
[i
+ 0] = uch
| 0x000000e0U
;
352 } else if (uch
<= 0x0010ffffU
) {
353 str
[i
+ 3] = (uch
& 63) | 0x80; uch
>>= 6;
354 str
[i
+ 2] = (uch
& 63) | 0x80; uch
>>= 6;
355 str
[i
+ 1] = (uch
& 63) | 0x80; uch
>>= 6;
356 str
[i
+ 0] = uch
| 0x000000f0U
;
360 /* must be an invalid uchar */
361 str
[i
++] = uch
& 0xff;
367 * Printing functions, these lose information
370 void u_set_char(char *str
, int *idx
, uchar uch
)
374 if (unlikely(uch
<= 0x0000001fU
))
377 if (uch
<= 0x0000007fU
) {
381 } else if (uch
<= 0x000007ffU
) {
382 str
[i
+ 1] = (uch
& 63) | 0x80; uch
>>= 6;
383 str
[i
+ 0] = uch
| 0x000000c0U
;
387 } else if (uch
<= 0x0000ffffU
) {
388 str
[i
+ 2] = (uch
& 63) | 0x80; uch
>>= 6;
389 str
[i
+ 1] = (uch
& 63) | 0x80; uch
>>= 6;
390 str
[i
+ 0] = uch
| 0x000000e0U
;
394 } else if (uch
<= 0x0010ffffU
) {
395 str
[i
+ 3] = (uch
& 63) | 0x80; uch
>>= 6;
396 str
[i
+ 2] = (uch
& 63) | 0x80; uch
>>= 6;
397 str
[i
+ 1] = (uch
& 63) | 0x80; uch
>>= 6;
398 str
[i
+ 0] = uch
| 0x000000f0U
;
404 /* control character or invalid unicode */
406 /* handle this special case here to make the common case fast */
411 str
[i
++] = hex_tab
[(uch
>> 4) & 0xf];
412 str
[i
++] = hex_tab
[uch
& 0xf];
418 int u_copy_chars(char *dst
, const char *src
, int *width
)
426 u_get_char(src
, &si
, &u
);
430 cw
= u_char_width(u
);
433 if (unlikely(w
< 0)) {
439 dst
[di
++] = hex_tab
[(u
>> 4) & 0xf];
441 dst
[di
++] = hex_tab
[u
& 0xf];
446 u_set_char(dst
, &di
, u
);
452 int u_skip_chars(const char *str
, int *width
)
460 u_get_char(str
, &idx
, &u
);
461 w
-= u_char_width(u
);
463 /* add 1..3 if skipped 'too much' (the last char was double width or invalid (<xx>)) */
469 * Comparison functions
/*
 * Case-insensitive comparison of two characters.
 *
 * Both arguments are upper-cased with towupper() and subtracted, so the
 * result is 0 when they are equal ignoring case, and otherwise the
 * difference between the upper-cased values.
 */
static inline int chcasecmp(int a, int b)
{
	const wint_t upper_a = towupper(a);
	const wint_t upper_b = towupper(b);

	return upper_a - upper_b;
}
477 int u_strcasecmp(const char *a
, const char *b
)
486 u_get_char(a
, &ai
, &au
);
487 u_get_char(b
, &bi
, &bu
);
488 res
= chcasecmp(au
, bu
);
499 int u_strncasecmp(const char *a
, const char *b
, int len
)
508 u_get_char(a
, &ai
, &au
);
509 u_get_char(b
, &bi
, &bu
);
510 res
= chcasecmp(au
, bu
);
522 char *u_strcasestr(const char *haystack
, const char *needle
)
524 /* strlen is faster and works here */
525 int haystack_len
= strlen(haystack
);
526 int needle_len
= u_strlen(needle
);
532 if (haystack_len
< needle_len
)
534 if (u_strncasecmp(needle
, haystack
, needle_len
) == 0)
535 return (char *)haystack
;
539 u_get_char(haystack
, &idx
, &u
);